# necessary libraries
library(readr)
library(plyr)
## Warning: package 'plyr' was built under R version 4.3.1
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(tidyr)
library(stringr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Point install.packages() at a fixed CRAN mirror so it never prompts for one.
options(repos = c(CRAN = "https://cran.rstudio.com/"))
# Install fastmap only when it is missing. The recorded run shows the
# unconditional reinstall failing with "cannot remove prior installation".
if (!requireNamespace("fastmap", quietly = TRUE)) {
  install.packages("fastmap")
}
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'fastmap' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'fastmap'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Latitude\AppData\Local\R\win-library\4.3\00LOCK\fastmap\libs\x64\fastmap.dll
## to
## C:\Users\Latitude\AppData\Local\R\win-library\4.3\fastmap\libs\x64\fastmap.dll:
## Permission denied
## Warning: restored 'fastmap'
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
# Install skimr only when it is not already available, so re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("skimr", quietly = TRUE)) {
  install.packages("skimr")
}
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'skimr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
library(skimr)
## Warning: package 'skimr' was built under R version 4.3.2
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.3.2
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.2
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# Read the sampled hotel-customer CSV into a tibble; readr guesses the column
# types and reports them in the message printed below.
HotelLisbon_data <- readr::read_csv(
  file = "F:/2023/Mercer Univesity/MSBA class 2023/2nd semester/BDA 620 Data Mining/Final Project/data collection/Hotel Customers data/Random Selected data/sample1hotel.csv"
)
## New names:
## • `` -> `...1`
## Rows: 75000 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): Nationality, Age, NameHash, DocIDHash, DistributionChannel, Market...
## dbl (26): ...1, ID, DaysSinceCreation, AverageLeadTime, LodgingRevenue, Othe...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in interactive sessions; View() is
# meant for interactive use and may fail when the script runs unattended.
if (interactive()) {
  View(HotelLisbon_data)
}
# Report the number of rows and columns.
dim(HotelLisbon_data)
## [1] 75000    32

#Let’s first investigate the raw data.

# List the column names of the raw data.
print(names(HotelLisbon_data))
##  [1] "...1"                 "ID"                   "Nationality"         
##  [4] "Age"                  "DaysSinceCreation"    "NameHash"            
##  [7] "DocIDHash"            "AverageLeadTime"      "LodgingRevenue"      
## [10] "OtherRevenue"         "BookingsCanceled"     "BookingsNoShowed"    
## [13] "BookingsCheckedIn"    "PersonsNights"        "RoomNights"          
## [16] "DaysSinceLastStay"    "DaysSinceFirstStay"   "DistributionChannel" 
## [19] "MarketSegment"        "SRHighFloor"          "SRLowFloor"          
## [22] "SRAccessibleRoom"     "SRMediumFloor"        "SRBathtub"           
## [25] "SRShower"             "SRCrib"               "SRKingSizeBed"       
## [28] "SRTwinBed"            "SRNearElevator"       "SRAwayFromElevator"  
## [31] "SRNoAlcoholInMiniBar" "SRQuietRoom"
# Per-column overview: missing counts, completeness and distribution summaries,
# grouped by column type.
skimr::skim(HotelLisbon_data)
Data summary
Name HotelLisbon_data
Number of rows 75000
Number of columns 32
_______________________
Column type frequency:
character 6
numeric 26
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Nationality 0 1 3 3 0 185 0
Age 0 1 1 4 0 105 0
NameHash 0 1 66 66 0 72600 0
DocIDHash 0 1 66 66 0 69343 0
DistributionChannel 0 1 6 23 0 4 0
MarketSegment 0 1 5 21 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
…1 0 1 37500.50 21650.78 1 18750.75 37500.5 56250.25 75000.0 ▇▇▇▇▇
ID 0 1 41823.04 24144.59 1 20884.75 41824.5 62715.25 83590.0 ▇▇▇▇▇
DaysSinceCreation 0 1 453.36 313.53 0 177.00 396.0 723.00 1095.0 ▇▆▅▃▃
AverageLeadTime 0 1 66.21 87.82 -1 0.00 29.0 103.00 588.0 ▇▂▁▁▁
LodgingRevenue 0 1 298.99 374.00 0 59.00 234.0 403.20 21781.0 ▇▁▁▁▁
OtherRevenue 0 1 67.49 110.63 0 2.00 38.5 88.00 5105.5 ▇▁▁▁▁
BookingsCanceled 0 1 0.00 0.07 0 0.00 0.0 0.00 9.0 ▇▁▁▁▁
BookingsNoShowed 0 1 0.00 0.03 0 0.00 0.0 0.00 3.0 ▇▁▁▁▁
BookingsCheckedIn 0 1 0.79 0.65 0 1.00 1.0 1.00 57.0 ▇▁▁▁▁
PersonsNights 0 1 4.65 4.56 0 1.00 4.0 6.00 116.0 ▇▁▁▁▁
RoomNights 0 1 2.36 2.28 0 1.00 2.0 4.00 185.0 ▇▁▁▁▁
DaysSinceLastStay 0 1 400.89 347.27 -1 26.00 366.0 694.00 1104.0 ▇▃▃▃▂
DaysSinceFirstStay 0 1 403.13 348.08 -1 27.00 369.0 698.00 1186.0 ▇▃▃▃▂
SRHighFloor 0 1 0.05 0.21 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
SRLowFloor 0 1 0.00 0.04 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
SRAccessibleRoom 0 1 0.00 0.02 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
SRMediumFloor 0 1 0.00 0.03 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
SRBathtub 0 1 0.00 0.05 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
SRShower 0 1 0.00 0.04 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
SRCrib 0 1 0.01 0.11 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
SRKingSizeBed 0 1 0.35 0.48 0 0.00 0.0 1.00 1.0 ▇▁▁▁▅
SRTwinBed 0 1 0.14 0.35 0 0.00 0.0 0.00 1.0 ▇▁▁▁▂
SRNearElevator 0 1 0.00 0.02 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
SRAwayFromElevator 0 1 0.00 0.06 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
SRNoAlcoholInMiniBar 0 1 0.00 0.01 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
SRQuietRoom 0 1 0.09 0.28 0 0.00 0.0 0.00 1.0 ▇▁▁▁▁
# Compact structure dump: each column's storage type and first few values.
utils::str(HotelLisbon_data)
## spc_tbl_ [75,000 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ...1                : num [1:75000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ ID                  : num [1:75000] 20351 62663 30398 39784 17929 ...
##  $ Nationality         : chr [1:75000] "BRA" "CAN" "PHL" "FRA" ...
##  $ Age                 : chr [1:75000] "85" "30" "70" "31" ...
##  $ DaysSinceCreation   : num [1:75000] 733 178 564 430 785 314 794 237 817 750 ...
##  $ NameHash            : chr [1:75000] "0x0BF2ECC3BF14F7FF3F926275E9BAAFAFF5823E69F81319DCCC5867DC986E10DC" "0xE4899D5F1CF2354CE1EBCD1717CE2EC2D91DE694C9118ADA37CB726A3F43DE22" "0xA1D0401C1635B389B99596B21D6C463B6E63444457B010DDD3D31AC1CE19C2ED" "0x0C948619213E11A1EB2E326CA64277BAFADDDDE51D7D2EB9DA7B71295C8704BC" ...
##  $ DocIDHash           : chr [1:75000] "0x44749B4F7510099B0A4BEF85DE72E75ABD3CC90896949AAC4EF1A46598DCE490" "0x78C451F6556F7129351AE28B3BA7DD499E258DDBEC31F89302AE62899301DB4A" "0x613A9E9859B7CA68E8D3613BE4B2880059B2A5134E2B2B8C85186EAC073A3AC8" "0x56D89BC906AA74D89F63D75436A2BBC0B2DE9EAD2D49CEA970713BD02290AE54" ...
##  $ AverageLeadTime     : num [1:75000] 41 119 94 47 148 0 33 230 213 157 ...
##  $ LodgingRevenue      : num [1:75000] 53 1041 1512 219 269 ...
##  $ OtherRevenue        : num [1:75000] 14 162 72 146 58.5 ...
##  $ BookingsCanceled    : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ BookingsNoShowed    : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ BookingsCheckedIn   : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
##  $ PersonsNights       : num [1:75000] 2 6 8 6 8 0 2 5 8 6 ...
##  $ RoomNights          : num [1:75000] 1 3 4 3 4 0 2 5 4 3 ...
##  $ DaysSinceLastStay   : num [1:75000] 734 181 568 433 789 -1 796 242 821 753 ...
##  $ DaysSinceFirstStay  : num [1:75000] 734 181 568 433 789 -1 796 242 821 753 ...
##  $ DistributionChannel : chr [1:75000] "Travel Agent/Operator" "Travel Agent/Operator" "Travel Agent/Operator" "Travel Agent/Operator" ...
##  $ MarketSegment       : chr [1:75000] "Travel Agent/Operator" "Other" "Other" "Travel Agent/Operator" ...
##  $ SRHighFloor         : num [1:75000] 0 1 0 0 0 0 0 0 0 0 ...
##  $ SRLowFloor          : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SRAccessibleRoom    : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SRMediumFloor       : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SRBathtub           : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SRShower            : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SRCrib              : num [1:75000] 0 0 0 1 0 0 0 0 0 0 ...
##  $ SRKingSizeBed       : num [1:75000] 0 0 0 0 0 1 0 0 1 0 ...
##  $ SRTwinBed           : num [1:75000] 1 0 0 0 0 0 0 0 0 0 ...
##  $ SRNearElevator      : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SRAwayFromElevator  : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SRNoAlcoholInMiniBar: num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ SRQuietRoom         : num [1:75000] 0 0 0 0 0 0 0 0 1 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ...1 = col_double(),
##   ..   ID = col_double(),
##   ..   Nationality = col_character(),
##   ..   Age = col_character(),
##   ..   DaysSinceCreation = col_double(),
##   ..   NameHash = col_character(),
##   ..   DocIDHash = col_character(),
##   ..   AverageLeadTime = col_double(),
##   ..   LodgingRevenue = col_double(),
##   ..   OtherRevenue = col_double(),
##   ..   BookingsCanceled = col_double(),
##   ..   BookingsNoShowed = col_double(),
##   ..   BookingsCheckedIn = col_double(),
##   ..   PersonsNights = col_double(),
##   ..   RoomNights = col_double(),
##   ..   DaysSinceLastStay = col_double(),
##   ..   DaysSinceFirstStay = col_double(),
##   ..   DistributionChannel = col_character(),
##   ..   MarketSegment = col_character(),
##   ..   SRHighFloor = col_double(),
##   ..   SRLowFloor = col_double(),
##   ..   SRAccessibleRoom = col_double(),
##   ..   SRMediumFloor = col_double(),
##   ..   SRBathtub = col_double(),
##   ..   SRShower = col_double(),
##   ..   SRCrib = col_double(),
##   ..   SRKingSizeBed = col_double(),
##   ..   SRTwinBed = col_double(),
##   ..   SRNearElevator = col_double(),
##   ..   SRAwayFromElevator = col_double(),
##   ..   SRNoAlcoholInMiniBar = col_double(),
##   ..   SRQuietRoom = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# HotelLisbon_data is my data frame

# Install knitr only when it is not already available, so re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("knitr", quietly = TRUE)) {
  install.packages("knitr")
}
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'knitr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
library(knitr)
## Warning: package 'knitr' was built under R version 4.3.2
# Build a two-column lookup of every column name and its class.
# - vapply() guarantees one string per column; a column with several classes
#   is collapsed into a single "a/b" label instead of producing a list.
# - row.names = NULL stops the column names from also being attached as row
#   names, which otherwise show up as a duplicated first column in the table.
variable_summary <- data.frame(
  Variable = names(HotelLisbon_data),
  Type = vapply(
    HotelLisbon_data,
    function(column) paste(class(column), collapse = "/"),
    character(1)
  ),
  row.names = NULL
)

# Render the lookup as a simple captioned table.
kable(variable_summary, caption = "Variable Summary")
Variable Summary
Variable Type
…1 …1 numeric
ID ID numeric
Nationality Nationality character
Age Age character
DaysSinceCreation DaysSinceCreation numeric
NameHash NameHash character
DocIDHash DocIDHash character
AverageLeadTime AverageLeadTime numeric
LodgingRevenue LodgingRevenue numeric
OtherRevenue OtherRevenue numeric
BookingsCanceled BookingsCanceled numeric
BookingsNoShowed BookingsNoShowed numeric
BookingsCheckedIn BookingsCheckedIn numeric
PersonsNights PersonsNights numeric
RoomNights RoomNights numeric
DaysSinceLastStay DaysSinceLastStay numeric
DaysSinceFirstStay DaysSinceFirstStay numeric
DistributionChannel DistributionChannel character
MarketSegment MarketSegment character
SRHighFloor SRHighFloor numeric
SRLowFloor SRLowFloor numeric
SRAccessibleRoom SRAccessibleRoom numeric
SRMediumFloor SRMediumFloor numeric
SRBathtub SRBathtub numeric
SRShower SRShower numeric
SRCrib SRCrib numeric
SRKingSizeBed SRKingSizeBed numeric
SRTwinBed SRTwinBed numeric
SRNearElevator SRNearElevator numeric
SRAwayFromElevator SRAwayFromElevator numeric
SRNoAlcoholInMiniBar SRNoAlcoholInMiniBar numeric
SRQuietRoom SRQuietRoom numeric

#Let’s see what we can evaluate if we view the summary statistics

# Print per-column summary statistics (quartiles and mean for numeric columns;
# length, class and mode for character columns).
print(summary(HotelLisbon_data))
##       ...1             ID        Nationality            Age           
##  Min.   :    1   Min.   :    1   Length:75000       Length:75000      
##  1st Qu.:18751   1st Qu.:20885   Class :character   Class :character  
##  Median :37501   Median :41825   Mode  :character   Mode  :character  
##  Mean   :37501   Mean   :41823                                        
##  3rd Qu.:56250   3rd Qu.:62715                                        
##  Max.   :75000   Max.   :83590                                        
##  DaysSinceCreation   NameHash          DocIDHash         AverageLeadTime 
##  Min.   :   0.0    Length:75000       Length:75000       Min.   : -1.00  
##  1st Qu.: 177.0    Class :character   Class :character   1st Qu.:  0.00  
##  Median : 396.0    Mode  :character   Mode  :character   Median : 29.00  
##  Mean   : 453.4                                          Mean   : 66.21  
##  3rd Qu.: 723.0                                          3rd Qu.:103.00  
##  Max.   :1095.0                                          Max.   :588.00  
##  LodgingRevenue     OtherRevenue     BookingsCanceled   BookingsNoShowed   
##  Min.   :    0.0   Min.   :   0.00   Min.   :0.000000   Min.   :0.0000000  
##  1st Qu.:   59.0   1st Qu.:   2.00   1st Qu.:0.000000   1st Qu.:0.0000000  
##  Median :  234.0   Median :  38.50   Median :0.000000   Median :0.0000000  
##  Mean   :  299.0   Mean   :  67.49   Mean   :0.001987   Mean   :0.0005867  
##  3rd Qu.:  403.2   3rd Qu.:  88.00   3rd Qu.:0.000000   3rd Qu.:0.0000000  
##  Max.   :21781.0   Max.   :5105.50   Max.   :9.000000   Max.   :3.0000000  
##  BookingsCheckedIn PersonsNights       RoomNights      DaysSinceLastStay
##  Min.   : 0.0000   Min.   :  0.000   Min.   :  0.000   Min.   :  -1.0   
##  1st Qu.: 1.0000   1st Qu.:  1.000   1st Qu.:  1.000   1st Qu.:  26.0   
##  Median : 1.0000   Median :  4.000   Median :  2.000   Median : 366.0   
##  Mean   : 0.7934   Mean   :  4.647   Mean   :  2.358   Mean   : 400.9   
##  3rd Qu.: 1.0000   3rd Qu.:  6.000   3rd Qu.:  4.000   3rd Qu.: 694.0   
##  Max.   :57.0000   Max.   :116.000   Max.   :185.000   Max.   :1104.0   
##  DaysSinceFirstStay DistributionChannel MarketSegment       SRHighFloor     
##  Min.   :  -1.0     Length:75000        Length:75000       Min.   :0.00000  
##  1st Qu.:  27.0     Class :character    Class :character   1st Qu.:0.00000  
##  Median : 369.0     Mode  :character    Mode  :character   Median :0.00000  
##  Mean   : 403.1                                            Mean   :0.04753  
##  3rd Qu.: 698.0                                            3rd Qu.:0.00000  
##  Max.   :1186.0                                            Max.   :1.00000  
##    SRLowFloor       SRAccessibleRoom  SRMediumFloor         SRBathtub      
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.0000000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.0000000   1st Qu.:0.00000  
##  Median :0.000000   Median :0.00000   Median :0.0000000   Median :0.00000  
##  Mean   :0.001373   Mean   :0.00028   Mean   :0.0008933   Mean   :0.00288  
##  3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.0000000   3rd Qu.:0.00000  
##  Max.   :1.000000   Max.   :1.00000   Max.   :1.0000000   Max.   :1.00000  
##     SRShower            SRCrib        SRKingSizeBed      SRTwinBed    
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.000000   Median :0.00000   Median :0.0000   Median :0.000  
##  Mean   :0.001787   Mean   :0.01325   Mean   :0.3522   Mean   :0.143  
##  3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.000  
##  Max.   :1.000000   Max.   :1.00000   Max.   :1.0000   Max.   :1.000  
##  SRNearElevator    SRAwayFromElevator SRNoAlcoholInMiniBar  SRQuietRoom     
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.0000000    Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.0000000    1st Qu.:0.00000  
##  Median :0.00000   Median :0.000000   Median :0.0000000    Median :0.00000  
##  Mean   :0.00036   Mean   :0.003613   Mean   :0.0001333    Mean   :0.08803  
##  3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.0000000    3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.000000   Max.   :1.0000000    Max.   :1.00000
library(knitr)

# Keep the per-column summary so it can be rendered as a formatted table.
summary_table <- summary(object = HotelLisbon_data)

# Render the stored summary as a Markdown table with a caption.
kable(x = summary_table, caption = "Summary Statistics", format = "markdown")
Summary Statistics
…1 ID Nationality Age DaysSinceCreation NameHash DocIDHash AverageLeadTime LodgingRevenue OtherRevenue BookingsCanceled BookingsNoShowed BookingsCheckedIn PersonsNights RoomNights DaysSinceLastStay DaysSinceFirstStay DistributionChannel MarketSegment SRHighFloor SRLowFloor SRAccessibleRoom SRMediumFloor SRBathtub SRShower SRCrib SRKingSizeBed SRTwinBed SRNearElevator SRAwayFromElevator SRNoAlcoholInMiniBar SRQuietRoom
Min. : 1 Min. : 1 Length:75000 Length:75000 Min. : 0.0 Length:75000 Length:75000 Min. : -1.00 Min. : 0.0 Min. : 0.00 Min. :0.000000 Min. :0.0000000 Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : -1.0 Min. : -1.0 Length:75000 Length:75000 Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.0000000 Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.0000 Min. :0.000 Min. :0.00000 Min. :0.000000 Min. :0.0000000 Min. :0.00000
1st Qu.:18751 1st Qu.:20885 Class :character Class :character 1st Qu.: 177.0 Class :character Class :character 1st Qu.: 0.00 1st Qu.: 59.0 1st Qu.: 2.00 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.: 1.0000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 26.0 1st Qu.: 27.0 Class :character Class :character 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.00000
Median :37501 Median :41825 Mode :character Mode :character Median : 396.0 Mode :character Mode :character Median : 29.00 Median : 234.0 Median : 38.50 Median :0.000000 Median :0.0000000 Median : 1.0000 Median : 4.000 Median : 2.000 Median : 366.0 Median : 369.0 Mode :character Mode :character Median :0.00000 Median :0.000000 Median :0.00000 Median :0.0000000 Median :0.00000 Median :0.000000 Median :0.00000 Median :0.0000 Median :0.000 Median :0.00000 Median :0.000000 Median :0.0000000 Median :0.00000
Mean :37501 Mean :41823 NA NA Mean : 453.4 NA NA Mean : 66.21 Mean : 299.0 Mean : 67.49 Mean :0.001987 Mean :0.0005867 Mean : 0.7934 Mean : 4.647 Mean : 2.358 Mean : 400.9 Mean : 403.1 NA NA Mean :0.04753 Mean :0.001373 Mean :0.00028 Mean :0.0008933 Mean :0.00288 Mean :0.001787 Mean :0.01325 Mean :0.3522 Mean :0.143 Mean :0.00036 Mean :0.003613 Mean :0.0001333 Mean :0.08803
3rd Qu.:56250 3rd Qu.:62715 NA NA 3rd Qu.: 723.0 NA NA 3rd Qu.:103.00 3rd Qu.: 403.2 3rd Qu.: 88.00 3rd Qu.:0.000000 3rd Qu.:0.0000000 3rd Qu.: 1.0000 3rd Qu.: 6.000 3rd Qu.: 4.000 3rd Qu.: 694.0 3rd Qu.: 698.0 NA NA 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.0000000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.0000000 3rd Qu.:0.00000
Max. :75000 Max. :83590 NA NA Max. :1095.0 NA NA Max. :588.00 Max. :21781.0 Max. :5105.50 Max. :9.000000 Max. :3.0000000 Max. :57.0000 Max. :116.000 Max. :185.000 Max. :1104.0 Max. :1186.0 NA NA Max. :1.00000 Max. :1.000000 Max. :1.00000 Max. :1.0000000 Max. :1.00000 Max. :1.000000 Max. :1.00000 Max. :1.0000 Max. :1.000 Max. :1.00000 Max. :1.000000 Max. :1.0000000 Max. :1.00000

# Let’s see if we have any missing values.

# Count NA entries per column: is.na() flags every cell, colSums() totals the
# flags column by column.
missing_values <- HotelLisbon_data %>%
  is.na() %>%
  colSums()
print(missing_values)
##                 ...1                   ID          Nationality 
##                    0                    0                    0 
##                  Age    DaysSinceCreation             NameHash 
##                    0                    0                    0 
##            DocIDHash      AverageLeadTime       LodgingRevenue 
##                    0                    0                    0 
##         OtherRevenue     BookingsCanceled     BookingsNoShowed 
##                    0                    0                    0 
##    BookingsCheckedIn        PersonsNights           RoomNights 
##                    0                    0                    0 
##    DaysSinceLastStay   DaysSinceFirstStay  DistributionChannel 
##                    0                    0                    0 
##        MarketSegment          SRHighFloor           SRLowFloor 
##                    0                    0                    0 
##     SRAccessibleRoom        SRMediumFloor            SRBathtub 
##                    0                    0                    0 
##             SRShower               SRCrib        SRKingSizeBed 
##                    0                    0                    0 
##            SRTwinBed       SRNearElevator   SRAwayFromElevator 
##                    0                    0                    0 
## SRNoAlcoholInMiniBar          SRQuietRoom 
##                    0                    0

# It seems there are no missing (NA) values.

# At first glance, the summary statistics show that “Age” is stored as character, which should not be the case. Some entries probably contain non-numeric text.

# Yes: after checking the data set, Age contains “NULL” values for customers whose Nationality is PRT. We need to view the whole data set.

#Let’s investigate all the Null Values if there are any

# Count NA cells across the whole data set, then in the Age column alone.
# is.na() only detects R's NA marker; text such as "NULL" is not counted here.
null_values <- sum(is.na(HotelLisbon_data))

null_values_age <- sum(is.na(HotelLisbon_data[["Age"]]))


null_values
## [1] 0
null_values_age
## [1] 0

# is.na() reports no NA values, but it would not detect a missing value written as the text "NULL".

# Find rows where Age is the literal string "NULL"
# Logical mask: TRUE for every row whose Age holds the text "NULL".
null_age_rows <- with(HotelLisbon_data, Age == "NULL")

# Tally the nationalities of the flagged rows.
null_age_by_country <- table(HotelLisbon_data[["Nationality"]][null_age_rows])

# Show the count per nationality.
print(null_age_by_country)
## 
##  ESP  PAN  PRT 
##    1    1 3368

#Only age has a NULL value

# From observing the data set, the missing ages are written as “NULL” in capital letters.

# Let’s see which customers have a NULL value for Age.

# Pull out the customers whose Age is recorded as the text "NULL", keeping only
# the columns that identify them.
customers_with_null_age <- HotelLisbon_data %>%
  filter(.data$Age == "NULL") %>%
  select(all_of(c("ID", "Nationality", "NameHash", "DocIDHash")))

print(customers_with_null_age)
## # A tibble: 3,370 × 4
##       ID Nationality NameHash                                          DocIDHash
##    <dbl> <chr>       <chr>                                             <chr>    
##  1 45142 PRT         0x76DC0F0F62F831421F112A43C0E872118157C520FBE246… 0x5FA1E0…
##  2 35353 PRT         0x03E34CF88B9A651D29C435163112746CB79CD68D7B3FC0… 0x5FA1E0…
##  3 28863 PRT         0xCF7C76165178E1DB83FA1ED2B2BECCA075D8AAC8104799… 0x5FA1E0…
##  4 36797 PRT         0xDC80CCCAFE75A6647DA264B3AB63DF7BBE00B810E99350… 0x5FA1E0…
##  5 10793 PRT         0x01FBEA469D671F06FF4954F108BE7BC6D56A222F429815… 0x5FA1E0…
##  6  6346 PRT         0xDA7528770C8A14A3026F73F0B61C983F4EE721B7992BEE… 0x5FA1E0…
##  7  3245 PRT         0xE8E01F1987C2E0ADB97CEA6FA23FAD54308C4FC3D518FE… 0x5FA1E0…
##  8  2199 PRT         0x12DBDE199AE9016BC597665C6429B4980CAE2E6B51F53D… 0x5FA1E0…
##  9 70109 PRT         0x09D3589CED237C1402EAB5B7ADEA72775360592D6F1B78… 0x5FA1E0…
## 10 64592 PRT         0x51546E434D6D257A120E19AA3310943085A22DE2248F58… 0x5FA1E0…
## # ℹ 3,360 more rows

# Many customers with Nationality PRT have a NULL Age, and one customer each from PAN and ESP does too. We need to convert Age into a numerical/quantitative variable.

#Let’s check for negative, NA, NULL values in variables

# Number of negative observations in each variable.
# Character columns (e.g. Age, which is read as text) are parsed to numbers
# first: comparing strings with `< 0` is a locale-dependent text comparison,
# not a numeric one. Text that is not a number (such as "NULL") becomes NA and
# is ignored by na.rm = TRUE; the conversion warning is expected and silenced.
negative_values <- vapply(
  HotelLisbon_data,
  function(column) {
    if (is.character(column)) {
      column <- suppressWarnings(as.numeric(column))
    }
    sum(column < 0, na.rm = TRUE)
  },
  integer(1)
)
print(negative_values)
##                 ...1                   ID          Nationality 
##                    0                    0                    0 
##                  Age    DaysSinceCreation             NameHash 
##                   15                    0                    0 
##            DocIDHash      AverageLeadTime       LodgingRevenue 
##                    0                   10                    0 
##         OtherRevenue     BookingsCanceled     BookingsNoShowed 
##                    0                    0                    0 
##    BookingsCheckedIn        PersonsNights           RoomNights 
##                    0                    0                    0 
##    DaysSinceLastStay   DaysSinceFirstStay  DistributionChannel 
##                17886                17886                    0 
##        MarketSegment          SRHighFloor           SRLowFloor 
##                    0                    0                    0 
##     SRAccessibleRoom        SRMediumFloor            SRBathtub 
##                    0                    0                    0 
##             SRShower               SRCrib        SRKingSizeBed 
##                    0                    0                    0 
##            SRTwinBed       SRNearElevator   SRAwayFromElevator 
##                    0                    0                    0 
## SRNoAlcoholInMiniBar          SRQuietRoom 
##                    0                    0
# Number of NA observations in each variable, as a named integer vector.
na_values <- vapply(
  HotelLisbon_data,
  function(column) sum(is.na(column)),
  integer(1)
)
print(na_values)
##                 ...1                   ID          Nationality 
##                    0                    0                    0 
##                  Age    DaysSinceCreation             NameHash 
##                    0                    0                    0 
##            DocIDHash      AverageLeadTime       LodgingRevenue 
##                    0                    0                    0 
##         OtherRevenue     BookingsCanceled     BookingsNoShowed 
##                    0                    0                    0 
##    BookingsCheckedIn        PersonsNights           RoomNights 
##                    0                    0                    0 
##    DaysSinceLastStay   DaysSinceFirstStay  DistributionChannel 
##                    0                    0                    0 
##        MarketSegment          SRHighFloor           SRLowFloor 
##                    0                    0                    0 
##     SRAccessibleRoom        SRMediumFloor            SRBathtub 
##                    0                    0                    0 
##             SRShower               SRCrib        SRKingSizeBed 
##                    0                    0                    0 
##            SRTwinBed       SRNearElevator   SRAwayFromElevator 
##                    0                    0                    0 
## SRNoAlcoholInMiniBar          SRQuietRoom 
##                    0                    0
# Number of observations in each variable that hold the literal text "NULL".
# na.rm = TRUE keeps the count defined when a column also contains NA;
# without it a single NA would turn that column's count into NA.
null_values_char <- vapply(
  HotelLisbon_data,
  function(column) sum(column == "NULL", na.rm = TRUE),
  integer(1)
)
print(null_values_char)
##                 ...1                   ID          Nationality 
##                    0                    0                    0 
##                  Age    DaysSinceCreation             NameHash 
##                 3370                    0                    0 
##            DocIDHash      AverageLeadTime       LodgingRevenue 
##                    0                    0                    0 
##         OtherRevenue     BookingsCanceled     BookingsNoShowed 
##                    0                    0                    0 
##    BookingsCheckedIn        PersonsNights           RoomNights 
##                    0                    0                    0 
##    DaysSinceLastStay   DaysSinceFirstStay  DistributionChannel 
##                    0                    0                    0 
##        MarketSegment          SRHighFloor           SRLowFloor 
##                    0                    0                    0 
##     SRAccessibleRoom        SRMediumFloor            SRBathtub 
##                    0                    0                    0 
##             SRShower               SRCrib        SRKingSizeBed 
##                    0                    0                    0 
##            SRTwinBed       SRNearElevator   SRAwayFromElevator 
##                    0                    0                    0 
## SRNoAlcoholInMiniBar          SRQuietRoom 
##                    0                    0
# Number of missing observations in each variable, on the assumption that a
# missing value would appear as NA in the numeric columns.
null_values_na <- vapply(
  HotelLisbon_data,
  function(column) sum(is.na(column)),
  integer(1)
)
print(null_values_na)
##                 ...1                   ID          Nationality 
##                    0                    0                    0 
##                  Age    DaysSinceCreation             NameHash 
##                    0                    0                    0 
##            DocIDHash      AverageLeadTime       LodgingRevenue 
##                    0                    0                    0 
##         OtherRevenue     BookingsCanceled     BookingsNoShowed 
##                    0                    0                    0 
##    BookingsCheckedIn        PersonsNights           RoomNights 
##                    0                    0                    0 
##    DaysSinceLastStay   DaysSinceFirstStay  DistributionChannel 
##                    0                    0                    0 
##        MarketSegment          SRHighFloor           SRLowFloor 
##                    0                    0                    0 
##     SRAccessibleRoom        SRMediumFloor            SRBathtub 
##                    0                    0                    0 
##             SRShower               SRCrib        SRKingSizeBed 
##                    0                    0                    0 
##            SRTwinBed       SRNearElevator   SRAwayFromElevator 
##                    0                    0                    0 
## SRNoAlcoholInMiniBar          SRQuietRoom 
##                    0                    0

#Let’s specify which variables have NULL, NA or negative values, and how many observations are affected

# Per-column data-quality counts: negative values, NA entries, and literal
# "NULL" strings. One row per column of HotelLisbon_data.
observations_summary <- data.frame(
  Variable = names(HotelLisbon_data),
  # Text columns (e.g. Age, which is read in as character) are parsed as
  # numbers first, so the comparison with 0 is numeric rather than a
  # locale-dependent string comparison. Text that is not a number becomes NA
  # and is ignored by na.rm = TRUE.
  Negative = vapply(
    HotelLisbon_data,
    function(x) {
      if (is.character(x)) {
        x <- suppressWarnings(as.numeric(x))
      }
      sum(x < 0, na.rm = TRUE)
    },
    integer(1)
  ),
  NA_Count = vapply(HotelLisbon_data, function(x) sum(is.na(x)), integer(1)),
  # na.rm = TRUE keeps the count defined when a column also contains NA
  NULL_Count = vapply(
    HotelLisbon_data,
    function(x) sum(x == "NULL", na.rm = TRUE),
    integer(1)
  )
)

# Print the summary
print(observations_summary)
##                                  Variable Negative NA_Count NULL_Count
## ...1                                 ...1        0        0          0
## ID                                     ID        0        0          0
## Nationality                   Nationality        0        0          0
## Age                                   Age       15        0       3370
## DaysSinceCreation       DaysSinceCreation        0        0          0
## NameHash                         NameHash        0        0          0
## DocIDHash                       DocIDHash        0        0          0
## AverageLeadTime           AverageLeadTime       10        0          0
## LodgingRevenue             LodgingRevenue        0        0          0
## OtherRevenue                 OtherRevenue        0        0          0
## BookingsCanceled         BookingsCanceled        0        0          0
## BookingsNoShowed         BookingsNoShowed        0        0          0
## BookingsCheckedIn       BookingsCheckedIn        0        0          0
## PersonsNights               PersonsNights        0        0          0
## RoomNights                     RoomNights        0        0          0
## DaysSinceLastStay       DaysSinceLastStay    17886        0          0
## DaysSinceFirstStay     DaysSinceFirstStay    17886        0          0
## DistributionChannel   DistributionChannel        0        0          0
## MarketSegment               MarketSegment        0        0          0
## SRHighFloor                   SRHighFloor        0        0          0
## SRLowFloor                     SRLowFloor        0        0          0
## SRAccessibleRoom         SRAccessibleRoom        0        0          0
## SRMediumFloor               SRMediumFloor        0        0          0
## SRBathtub                       SRBathtub        0        0          0
## SRShower                         SRShower        0        0          0
## SRCrib                             SRCrib        0        0          0
## SRKingSizeBed               SRKingSizeBed        0        0          0
## SRTwinBed                       SRTwinBed        0        0          0
## SRNearElevator             SRNearElevator        0        0          0
## SRAwayFromElevator     SRAwayFromElevator        0        0          0
## SRNoAlcoholInMiniBar SRNoAlcoholInMiniBar        0        0          0
## SRQuietRoom                   SRQuietRoom        0        0          0

#Variables with Negative Values: #Age: 15 negative values #AverageLeadTime: 10 negative values #DaysSinceLastStay: 17,886 negative values #DaysSinceFirstStay: 17,886 negative values

#Variables with NA Values: #No variable has NA values. #Variables with “NULL” Values: #Age: 3,370 “NULL” values

#Let’s first investigate Unique values in our data set

# Return up to `limit` distinct values for each column of `data`.
#
# data:  a data frame (or list) whose columns are inspected.
# limit: maximum number of distinct values kept per column (default 20).
# Returns a list with one element per column, holding the first `limit`
# distinct values of that column.
get_limited_unique_values <- function(data, limit = 20) {
  # head() copes with empty columns, where 1:min(0, limit) would count down
  # as c(1, 0) and produce a spurious NA; unique() is also evaluated only once.
  lapply(data, function(x) head(unique(x), limit))
}

# Cap on how many distinct values are shown per variable
display_limit <- 20

# Collect the capped distinct values for every column of HotelLisbon_data
limited_unique_values_list <- get_limited_unique_values(
  HotelLisbon_data,
  limit = display_limit
)

# Show each variable name, then its values, then a blank line
for (variable_name in names(limited_unique_values_list)) {
  cat("Variable:", variable_name, "\n")
  print(limited_unique_values_list[[variable_name]])
  cat("\n")
}
## Variable: ...1 
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
## 
## Variable: ID 
##  [1] 20351 62663 30398 39784 17929 46541 17258 55277 15704 19803 53746  4649
## [13] 10264  2629 23494 35607 40960 18689 36582 42337
## 
## Variable: Nationality 
##  [1] "BRA" "CAN" "PHL" "FRA" "HUN" "ITA" "EST" "BEL" "PRT" "GBR" "USA" "COL"
## [13] "CHN" "IRL" "DEU" "CHE" "AUS" "ESP" "BIH" "ISR"
## 
## Variable: Age 
##  [1] "85" "30" "70" "31" "29" "14" "49" "55" "53" "47" "44" "65" "62" "38" "20"
## [16] "41" "19" "69" "28" "40"
## 
## Variable: DaysSinceCreation 
##  [1]  733  178  564  430  785  314  794  237  817  750  249  990  897 1027  667
## [16]  486  411  771  472  386
## 
## Variable: NameHash 
##  [1] "0x0BF2ECC3BF14F7FF3F926275E9BAAFAFF5823E69F81319DCCC5867DC986E10DC"
##  [2] "0xE4899D5F1CF2354CE1EBCD1717CE2EC2D91DE694C9118ADA37CB726A3F43DE22"
##  [3] "0xA1D0401C1635B389B99596B21D6C463B6E63444457B010DDD3D31AC1CE19C2ED"
##  [4] "0x0C948619213E11A1EB2E326CA64277BAFADDDDE51D7D2EB9DA7B71295C8704BC"
##  [5] "0xCB5BBB03BC55C10987902657B83708EFF49192751D43942E784EA48C92E0F6B0"
##  [6] "0x5199CD2B356E8BD8D84D9CDDFEBF69771B04D65D7575C5AAAB5055ACA261EF90"
##  [7] "0xC4CDDFC6B120BBAAC63F4FFD1C39B5BFBC03C0992973321EA972AA8A714C4E36"
##  [8] "0x5D1CB184D92EE7F095CA22D47D1D3F867A77C0841F8C94CA2C1F4944803E47A1"
##  [9] "0x9F4162C841944D33574F243D2256001F1018A4F3DA5D8DF115DB2CB78BD30EEB"
## [10] "0x28E9485A3E06E2C0039D225160A87EF025D36489847555BAD0D610746B28F05D"
## [11] "0x3614128425809C4CD1438137D37BE65D0B422084B230DCFD5D4544668F379825"
## [12] "0xF7B61527AD41493413A8138C38F391BEE177F12CCF52D2AEED5652A8BD92C132"
## [13] "0xEC72C7BAE4558A9270FF8FDBCEC759B343C6AEF79F37B2DA464B759240124B2E"
## [14] "0x59CB21FECE9AD0AEDF436B5F745AFD4DFEF3F8EB010593FA89F36BAE87AD4A15"
## [15] "0x7A56EB69D6A8EB1CBDD32AF9285DE543353EE1258CE09E7CA3C9EA761B96A446"
## [16] "0x0CB7D63FEB45A9AC3641E5974598C3DBF713B42382C3E036F119972E4CECCAE9"
## [17] "0x1E3A0E71BAC664E904D7C2F145DF260C5973D26DD460A290C2D33CE02AD2CF98"
## [18] "0x6F820AF2D725F3CDA8C315494B6F01B69EF1AF35FD337185DE1A37065B28B13E"
## [19] "0xCE236402009BD6A678F698D67EC43B1AA38C1BC4207DCEA84D8128745432FD04"
## [20] "0xA002C5DE86431272037EEA01E58CD2D09359CB905C2578C64913E8BB6C371B64"
## 
## Variable: DocIDHash 
##  [1] "0x44749B4F7510099B0A4BEF85DE72E75ABD3CC90896949AAC4EF1A46598DCE490"
##  [2] "0x78C451F6556F7129351AE28B3BA7DD499E258DDBEC31F89302AE62899301DB4A"
##  [3] "0x613A9E9859B7CA68E8D3613BE4B2880059B2A5134E2B2B8C85186EAC073A3AC8"
##  [4] "0x56D89BC906AA74D89F63D75436A2BBC0B2DE9EAD2D49CEA970713BD02290AE54"
##  [5] "0xC510E8C80CF916E3A3F30A5F04DC3A8022F4F36693DF5BFED282F9575C55EC12"
##  [6] "0x639BC1149F7DCEE41997906F58BF2032CE69AD41FA5404CBB9117571DEA7FF7B"
##  [7] "0xBA6D5D11497ECF19C7A0295CB3BE0CEFE7649391F35FB1768C0409A5152DBB29"
##  [8] "0x547AF4CEFB803EBB128C5E72B279F8E6F9ABD1D9F39804C125F1D4E75D112EBC"
##  [9] "0xD53AAB77FC4500611E7210C0374ACE1EEEE0CDA884579764DE5FD8E3BF92767D"
## [10] "0xC15D6AF40193DAFB50FF3081F9172A886579F7A6F05E0A3041B5A1EA11E1074A"
## [11] "0xD68ADFC5BA8A13D376A57B9E0DEB2EB116235B749C6A60526EE8B9F20C2F80F5"
## [12] "0x416BDDF55E00D3DAFCF409BB2F6BFBB0EC2485746C1613D3C669E17A05C46E70"
## [13] "0xF7F8F90C807157F8DC908E3C755C8D41FB285DC8948968B982A99D81A1474276"
## [14] "0xA680C8A830675C0C29723634E1377786419648518D4673245679A9A152683A12"
## [15] "0x86F490B44CD068C0A09226E89AEA8A59BC9E5E3EAF20A6BBBA5EA7DB294C0A60"
## [16] "0xA67964202574DC2E30DF14A838D26B4982DBA18E7A3BF0E4A8E87F47CC701F3A"
## [17] "0x6EF04A842E6A189C2FB24BD3EC1733D848D190B2930E64E2BDE0FA96371EC7C7"
## [18] "0x44076C065C679BEB6426C095EA07613BDCB3EE2AD0F8355DBFBD515269C7BBAD"
## [19] "0xD5C981DA96B7B19EBED0A9F7F1DF7EBFE528797CB9B901CE3D46F7CC592A4F75"
## [20] "0x60F12E40D2C94F56652C0CA04BC56C0881734B3F56D3978A812778A6E4C6993E"
## 
## Variable: AverageLeadTime 
##  [1]  41 119  94  47 148   0  33 230 213 157   1  16  71 167   6  91   7  24  84
## [20]   2
## 
## Variable: LodgingRevenue 
##  [1]   53.00 1041.00 1512.00  219.00  268.80    0.00  218.00  870.00 1090.20
## [10]  168.00  368.00  281.47  270.24  448.80  225.00  467.95  536.00  126.00
## [19]  606.00  126.50
## 
## Variable: OtherRevenue 
##  [1]  14.00 162.00  72.00 146.00  58.50   0.00  22.00  35.00 271.50  44.50
## [11]  61.75  94.50 226.00  77.50  69.00 142.00  36.00 118.00 167.00  70.00
## 
## Variable: BookingsCanceled 
## [1] 0 2 1 4 3 9
## 
## Variable: BookingsNoShowed 
## [1] 0 1 2 3
## 
## Variable: BookingsCheckedIn 
##  [1]  1  0  2  3  8  4  7 12 10  5 34  6 13  9 11 29 14 57 19 15
## 
## Variable: PersonsNights 
##  [1]  2  6  8  0  5 12  4  3 15 10  1  9 14 18 22 24  7 16 13 20
## 
## Variable: RoomNights 
##  [1]  1  3  4  0  2  5  6  7 11  8 14 12  9 15 19 13 10 20 29 16
## 
## Variable: DaysSinceLastStay 
##  [1] 734 181 568 433 789  -1 796 242 821 753 253 993 901 673 488 415 773 476 388
## [20] 187
## 
## Variable: DaysSinceFirstStay 
##  [1] 734 181 568 433 789  -1 796 242 821 753 253 993 901 673 488 415 773 476 388
## [20] 190
## 
## Variable: DistributionChannel 
## [1] "Travel Agent/Operator"   "Direct"                 
## [3] "Corporate"               "Electronic Distribution"
## 
## Variable: MarketSegment 
## [1] "Travel Agent/Operator" "Other"                 "Direct"               
## [4] "Groups"                "Aviation"              "Complementary"        
## [7] "Corporate"            
## 
## Variable: SRHighFloor 
## [1] 0 1
## 
## Variable: SRLowFloor 
## [1] 0 1
## 
## Variable: SRAccessibleRoom 
## [1] 0 1
## 
## Variable: SRMediumFloor 
## [1] 0 1
## 
## Variable: SRBathtub 
## [1] 0 1
## 
## Variable: SRShower 
## [1] 0 1
## 
## Variable: SRCrib 
## [1] 0 1
## 
## Variable: SRKingSizeBed 
## [1] 0 1
## 
## Variable: SRTwinBed 
## [1] 1 0
## 
## Variable: SRNearElevator 
## [1] 0 1
## 
## Variable: SRAwayFromElevator 
## [1] 0 1
## 
## Variable: SRNoAlcoholInMiniBar 
## [1] 0 1
## 
## Variable: SRQuietRoom 
## [1] 0 1

#Let’s fix Age variable first

# Coerce Age from text to numbers; entries that are not numbers turn into NA
HotelLisbon_data$Age <- as.numeric(as.character(HotelLisbon_data[["Age"]]))
## Warning: NAs introduced by coercion
# Impute the missing ages with the median of the observed ages
median_age <- median(HotelLisbon_data[["Age"]], na.rm = TRUE)
HotelLisbon_data$Age <- replace(
  HotelLisbon_data[["Age"]],
  is.na(HotelLisbon_data[["Age"]]),
  median_age
)

# Report the median that was used for the imputation
cat("Median Age:", median_age, "\n")
## Median Age: 46
# Confirm that no missing ages remain
sum(is.na(HotelLisbon_data[["Age"]]))
## [1] 0

#Age: 15 negative values #Let’s find out which ones these are and why

# Logical flag per row: TRUE where the recorded age is below zero
negative_age_rows <- HotelLisbon_data[["Age"]] < 0

# Keep only the flagged rows, with all of their columns
rows_with_negative_age <- HotelLisbon_data[negative_age_rows, ]

# Show the flagged rows
print(rows_with_negative_age)
## # A tibble: 15 × 32
##     ...1    ID Nationality   Age DaysSinceCreation NameHash            DocIDHash
##    <dbl> <dbl> <chr>       <dbl>             <dbl> <chr>               <chr>    
##  1  2449  8361 DEU            -7               927 0x72A31262A688A38B… 0x539E60…
##  2  8095  6752 GBR           -11               953 0xDB672419DADD95C4… 0xB8599D…
##  3 16751 14688 PYF            -1               832 0xF02B0B5CAD9B762F… 0x7F8199…
##  4 19578  8629 NLD           -10               923 0xF5C3782374B119A5… 0xB0B8AE…
##  5 22127  2054 BIH            -7              1034 0x816353557EFCF6A1… 0xE2B44F…
##  6 24083 57343 PRT            -6               220 0xE327757DEF4F79D6… 0x6EB53E…
##  7 26007  5129 FRA            -9               981 0x6471098D9873D3D9… 0x28EE0D…
##  8 31440 20406 USA            -9               732 0x4464A2E5E3D712E9… 0xBC3566…
##  9 35378 15105 DEU           -11               827 0x8338DF4BF7DFD49E… 0x8B1995…
## 10 36493 16975 FRA            -1               799 0xC2DC5508F5A0CF5C… 0x995201…
## 11 44102 20329 DZA            -1               733 0x599FC8CB50ED9179… 0x731067…
## 12 44118 14322 PRT           -10               837 0x5202CE5913A8D676… 0x2BAA6D…
## 13 49911 10663 DEU           -10               891 0x77AF5AA2214FAAC1… 0x4AFD62…
## 14 57657  9920 DZA            -6               903 0x8106B7FC258F0757… 0xCACE52…
## 15 73661 10756 PRT            -6               891 0x6EC142C9678B27CE… 0x3DD8D4…
## # ℹ 25 more variables: AverageLeadTime <dbl>, LodgingRevenue <dbl>,
## #   OtherRevenue <dbl>, BookingsCanceled <dbl>, BookingsNoShowed <dbl>,
## #   BookingsCheckedIn <dbl>, PersonsNights <dbl>, RoomNights <dbl>,
## #   DaysSinceLastStay <dbl>, DaysSinceFirstStay <dbl>,
## #   DistributionChannel <chr>, MarketSegment <chr>, SRHighFloor <dbl>,
## #   SRLowFloor <dbl>, SRAccessibleRoom <dbl>, SRMediumFloor <dbl>,
## #   SRBathtub <dbl>, SRShower <dbl>, SRCrib <dbl>, SRKingSizeBed <dbl>, …

#There seems to be inaccuracy in Age as well: there are negative values and single-digit ages, which can’t be right, as a hotel cannot have children as its registered customers.

# Display the class of each variable, to see which columns are numeric and
# which are still stored as character before converting them to factors
sapply(HotelLisbon_data, class)
##                 ...1                   ID          Nationality 
##            "numeric"            "numeric"          "character" 
##                  Age    DaysSinceCreation             NameHash 
##            "numeric"            "numeric"          "character" 
##            DocIDHash      AverageLeadTime       LodgingRevenue 
##          "character"            "numeric"            "numeric" 
##         OtherRevenue     BookingsCanceled     BookingsNoShowed 
##            "numeric"            "numeric"            "numeric" 
##    BookingsCheckedIn        PersonsNights           RoomNights 
##            "numeric"            "numeric"            "numeric" 
##    DaysSinceLastStay   DaysSinceFirstStay  DistributionChannel 
##            "numeric"            "numeric"          "character" 
##        MarketSegment          SRHighFloor           SRLowFloor 
##          "character"            "numeric"            "numeric" 
##     SRAccessibleRoom        SRMediumFloor            SRBathtub 
##            "numeric"            "numeric"            "numeric" 
##             SRShower               SRCrib        SRKingSizeBed 
##            "numeric"            "numeric"            "numeric" 
##            SRTwinBed       SRNearElevator   SRAwayFromElevator 
##            "numeric"            "numeric"            "numeric" 
## SRNoAlcoholInMiniBar          SRQuietRoom 
##            "numeric"            "numeric"

#let’s now change certain character variables into factors and Binary variables into factors

# Turn the text-valued categorical columns into factors
HotelLisbon_data[c("Nationality", "DistributionChannel", "MarketSegment")] <-
  lapply(
    HotelLisbon_data[c("Nationality", "DistributionChannel", "MarketSegment")],
    as.factor
  )

# The 0/1 special-request (SR*) flags are treated as two-level factors as well
binary_vars <- c(
  "SRHighFloor", "SRLowFloor", "SRAccessibleRoom",
  "SRMediumFloor", "SRBathtub", "SRShower", "SRCrib",
  "SRKingSizeBed", "SRTwinBed", "SRNearElevator",
  "SRAwayFromElevator", "SRNoAlcoholInMiniBar", "SRQuietRoom"
)

HotelLisbon_data[binary_vars] <- Map(as.factor, HotelLisbon_data[binary_vars])

# Make sure the three booking counters are stored as numeric
HotelLisbon_data$BookingsCheckedIn <- as.numeric(HotelLisbon_data[["BookingsCheckedIn"]])
HotelLisbon_data$BookingsNoShowed <- as.numeric(HotelLisbon_data[["BookingsNoShowed"]])
HotelLisbon_data$BookingsCanceled <- as.numeric(HotelLisbon_data[["BookingsCanceled"]])

# Re-check the class of every column after the conversions
sapply(HotelLisbon_data, class)
##                 ...1                   ID          Nationality 
##            "numeric"            "numeric"             "factor" 
##                  Age    DaysSinceCreation             NameHash 
##            "numeric"            "numeric"          "character" 
##            DocIDHash      AverageLeadTime       LodgingRevenue 
##          "character"            "numeric"            "numeric" 
##         OtherRevenue     BookingsCanceled     BookingsNoShowed 
##            "numeric"            "numeric"            "numeric" 
##    BookingsCheckedIn        PersonsNights           RoomNights 
##            "numeric"            "numeric"            "numeric" 
##    DaysSinceLastStay   DaysSinceFirstStay  DistributionChannel 
##            "numeric"            "numeric"             "factor" 
##        MarketSegment          SRHighFloor           SRLowFloor 
##             "factor"             "factor"             "factor" 
##     SRAccessibleRoom        SRMediumFloor            SRBathtub 
##             "factor"             "factor"             "factor" 
##             SRShower               SRCrib        SRKingSizeBed 
##             "factor"             "factor"             "factor" 
##            SRTwinBed       SRNearElevator   SRAwayFromElevator 
##             "factor"             "factor"             "factor" 
## SRNoAlcoholInMiniBar          SRQuietRoom 
##             "factor"             "factor"
# Show the structure of the data set: the type of every column and its first values
str(HotelLisbon_data)
## spc_tbl_ [75,000 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ...1                : num [1:75000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ ID                  : num [1:75000] 20351 62663 30398 39784 17929 ...
##  $ Nationality         : Factor w/ 185 levels "ABW","AGO","AIA",..: 25 29 132 56 73 56 82 52 16 82 ...
##  $ Age                 : num [1:75000] 85 30 70 31 29 14 49 55 53 47 ...
##  $ DaysSinceCreation   : num [1:75000] 733 178 564 430 785 314 794 237 817 750 ...
##  $ NameHash            : chr [1:75000] "0x0BF2ECC3BF14F7FF3F926275E9BAAFAFF5823E69F81319DCCC5867DC986E10DC" "0xE4899D5F1CF2354CE1EBCD1717CE2EC2D91DE694C9118ADA37CB726A3F43DE22" "0xA1D0401C1635B389B99596B21D6C463B6E63444457B010DDD3D31AC1CE19C2ED" "0x0C948619213E11A1EB2E326CA64277BAFADDDDE51D7D2EB9DA7B71295C8704BC" ...
##  $ DocIDHash           : chr [1:75000] "0x44749B4F7510099B0A4BEF85DE72E75ABD3CC90896949AAC4EF1A46598DCE490" "0x78C451F6556F7129351AE28B3BA7DD499E258DDBEC31F89302AE62899301DB4A" "0x613A9E9859B7CA68E8D3613BE4B2880059B2A5134E2B2B8C85186EAC073A3AC8" "0x56D89BC906AA74D89F63D75436A2BBC0B2DE9EAD2D49CEA970713BD02290AE54" ...
##  $ AverageLeadTime     : num [1:75000] 41 119 94 47 148 0 33 230 213 157 ...
##  $ LodgingRevenue      : num [1:75000] 53 1041 1512 219 269 ...
##  $ OtherRevenue        : num [1:75000] 14 162 72 146 58.5 ...
##  $ BookingsCanceled    : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ BookingsNoShowed    : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ BookingsCheckedIn   : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
##  $ PersonsNights       : num [1:75000] 2 6 8 6 8 0 2 5 8 6 ...
##  $ RoomNights          : num [1:75000] 1 3 4 3 4 0 2 5 4 3 ...
##  $ DaysSinceLastStay   : num [1:75000] 734 181 568 433 789 -1 796 242 821 753 ...
##  $ DaysSinceFirstStay  : num [1:75000] 734 181 568 433 789 -1 796 242 821 753 ...
##  $ DistributionChannel : Factor w/ 4 levels "Corporate","Direct",..: 4 4 4 4 2 4 2 4 4 4 ...
##  $ MarketSegment       : Factor w/ 7 levels "Aviation","Complementary",..: 7 6 6 7 4 6 4 5 6 7 ...
##  $ SRHighFloor         : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ SRLowFloor          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRAccessibleRoom    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRMediumFloor       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRBathtub           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRShower            : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRCrib              : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
##  $ SRKingSizeBed       : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 2 1 ...
##  $ SRTwinBed           : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
##  $ SRNearElevator      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRAwayFromElevator  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRNoAlcoholInMiniBar: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRQuietRoom         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ...1 = col_double(),
##   ..   ID = col_double(),
##   ..   Nationality = col_character(),
##   ..   Age = col_character(),
##   ..   DaysSinceCreation = col_double(),
##   ..   NameHash = col_character(),
##   ..   DocIDHash = col_character(),
##   ..   AverageLeadTime = col_double(),
##   ..   LodgingRevenue = col_double(),
##   ..   OtherRevenue = col_double(),
##   ..   BookingsCanceled = col_double(),
##   ..   BookingsNoShowed = col_double(),
##   ..   BookingsCheckedIn = col_double(),
##   ..   PersonsNights = col_double(),
##   ..   RoomNights = col_double(),
##   ..   DaysSinceLastStay = col_double(),
##   ..   DaysSinceFirstStay = col_double(),
##   ..   DistributionChannel = col_character(),
##   ..   MarketSegment = col_character(),
##   ..   SRHighFloor = col_double(),
##   ..   SRLowFloor = col_double(),
##   ..   SRAccessibleRoom = col_double(),
##   ..   SRMediumFloor = col_double(),
##   ..   SRBathtub = col_double(),
##   ..   SRShower = col_double(),
##   ..   SRCrib = col_double(),
##   ..   SRKingSizeBed = col_double(),
##   ..   SRTwinBed = col_double(),
##   ..   SRNearElevator = col_double(),
##   ..   SRAwayFromElevator = col_double(),
##   ..   SRNoAlcoholInMiniBar = col_double(),
##   ..   SRQuietRoom = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Summary statistics for the numeric columns and level counts for the factor columns
summary(HotelLisbon_data)
##       ...1             ID         Nationality         Age       
##  Min.   :    1   Min.   :    1   FRA    :11170   Min.   :-11.0  
##  1st Qu.:18751   1st Qu.:20885   PRT    :10365   1st Qu.: 34.0  
##  Median :37501   Median :41825   DEU    : 9242   Median : 46.0  
##  Mean   :37501   Mean   :41823   GBR    : 7750   Mean   : 45.4  
##  3rd Qu.:56250   3rd Qu.:62715   ESP    : 4399   3rd Qu.: 56.0  
##  Max.   :75000   Max.   :83590   USA    : 3076   Max.   :114.0  
##                                  (Other):28998                  
##  DaysSinceCreation   NameHash          DocIDHash         AverageLeadTime 
##  Min.   :   0.0    Length:75000       Length:75000       Min.   : -1.00  
##  1st Qu.: 177.0    Class :character   Class :character   1st Qu.:  0.00  
##  Median : 396.0    Mode  :character   Mode  :character   Median : 29.00  
##  Mean   : 453.4                                          Mean   : 66.21  
##  3rd Qu.: 723.0                                          3rd Qu.:103.00  
##  Max.   :1095.0                                          Max.   :588.00  
##                                                                          
##  LodgingRevenue     OtherRevenue     BookingsCanceled   BookingsNoShowed   
##  Min.   :    0.0   Min.   :   0.00   Min.   :0.000000   Min.   :0.0000000  
##  1st Qu.:   59.0   1st Qu.:   2.00   1st Qu.:0.000000   1st Qu.:0.0000000  
##  Median :  234.0   Median :  38.50   Median :0.000000   Median :0.0000000  
##  Mean   :  299.0   Mean   :  67.49   Mean   :0.001987   Mean   :0.0005867  
##  3rd Qu.:  403.2   3rd Qu.:  88.00   3rd Qu.:0.000000   3rd Qu.:0.0000000  
##  Max.   :21781.0   Max.   :5105.50   Max.   :9.000000   Max.   :3.0000000  
##                                                                            
##  BookingsCheckedIn PersonsNights       RoomNights      DaysSinceLastStay
##  Min.   : 0.0000   Min.   :  0.000   Min.   :  0.000   Min.   :  -1.0   
##  1st Qu.: 1.0000   1st Qu.:  1.000   1st Qu.:  1.000   1st Qu.:  26.0   
##  Median : 1.0000   Median :  4.000   Median :  2.000   Median : 366.0   
##  Mean   : 0.7934   Mean   :  4.647   Mean   :  2.358   Mean   : 400.9   
##  3rd Qu.: 1.0000   3rd Qu.:  6.000   3rd Qu.:  4.000   3rd Qu.: 694.0   
##  Max.   :57.0000   Max.   :116.000   Max.   :185.000   Max.   :1104.0   
##                                                                         
##  DaysSinceFirstStay              DistributionChannel
##  Min.   :  -1.0     Corporate              : 2340   
##  1st Qu.:  27.0     Direct                 :10714   
##  Median : 369.0     Electronic Distribution:  456   
##  Mean   : 403.1     Travel Agent/Operator  :61490   
##  3rd Qu.: 698.0                                     
##  Max.   :1186.0                                     
##                                                     
##                MarketSegment   SRHighFloor SRLowFloor SRAccessibleRoom
##  Aviation             :  221   0:71435     0:74897    0:74979         
##  Complementary        :  453   1: 3565     1:  103    1:   21         
##  Corporate            : 1939                                          
##  Direct               :10312                                          
##  Groups               : 8519                                          
##  Other                :43046                                          
##  Travel Agent/Operator:10510                                          
##  SRMediumFloor SRBathtub SRShower  SRCrib    SRKingSizeBed SRTwinBed
##  0:74933       0:74784   0:74866   0:74006   0:48583       0:64277  
##  1:   67       1:  216   1:  134   1:  994   1:26417       1:10723  
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##  SRNearElevator SRAwayFromElevator SRNoAlcoholInMiniBar SRQuietRoom
##  0:74973        0:74729            0:74990              0:68398    
##  1:   27        1:  271            1:   10              1: 6602    
##                                                                    
##                                                                    
##                                                                    
##                                                                    
## 

#Let’s create some Bar Charts and Histograms for further investigation

# Columns to plot: numeric columns get a histogram, all others a bar chart
histogram_cols <- c("Age", "DaysSinceCreation", "AverageLeadTime", "LodgingRevenue", "OtherRevenue", 
                    "BookingsCanceled", "BookingsNoShowed", "BookingsCheckedIn", "PersonsNights", "RoomNights", 
                    "DaysSinceLastStay", "DaysSinceFirstStay", "Nationality", "DistributionChannel", 
                    "MarketSegment", "SRHighFloor", "SRLowFloor", "SRAccessibleRoom", "SRMediumFloor", 
                    "SRBathtub", "SRShower", "SRCrib", "SRKingSizeBed", "SRTwinBed", "SRNearElevator", 
                    "SRAwayFromElevator", "SRNoAlcoholInMiniBar", "SRQuietRoom")

for (col in histogram_cols) {
  # aes(.data[[col]]) maps the column whose name is held in `col`; it replaces
  # aes_string(), which is deprecated since ggplot2 3.0.0. The axis label is
  # set explicitly through labs(), so the plots keep the plain column name.
  if (is.numeric(HotelLisbon_data[[col]])) {
    plot <- ggplot(HotelLisbon_data, aes(x = .data[[col]])) +
      geom_histogram(bins = 30, fill = "blue", color = "black") +
      labs(title = paste("Histogram of", col), x = col, y = "Frequency") +
      theme_minimal()
  } else {
    plot <- ggplot(HotelLisbon_data, aes(x = .data[[col]])) +
      geom_bar(fill = "blue", color = "black") +
      labs(title = paste("Bar Chart of", col), x = col, y = "Count") +
      theme_minimal()
  }
  # Plots created inside a loop are not auto-printed, so print explicitly
  print(plot)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#Let’s create further visualizations, keeping in mind that some variables are numerical and others categorical

# Numeric Variables
numeric_cols <- c("Age", "AverageLeadTime", "LodgingRevenue", "OtherRevenue", 
                  "BookingsCanceled", "BookingsNoShowed", "BookingsCheckedIn", "PersonsNights", "RoomNights", 
                  "DaysSinceLastStay", "DaysSinceFirstStay")

# Create one boxplot per numeric variable, with its outliers overlaid in red
for (col in numeric_cols) {
  # Map the column through the .data pronoun so the aesthetic is resolved
  # inside the plot's own data instead of reaching out to the global data frame
  plot <- ggplot(HotelLisbon_data, aes(y = as.numeric(.data[[col]]))) +
    geom_boxplot(fill = "blue", color = "black") +
    labs(title = paste("Boxplot of", col), y = col) +
    theme_minimal()

  # Highlight outliers (values beyond the whiskers as computed by boxplot.stats)
  outliers <- boxplot.stats(HotelLisbon_data[[col]])$out
  if (length(outliers) > 0) {
    # A boxplot without an x aesthetic is drawn at x = 0, so the overlay uses
    # x = 0 as well; with x = 1 the red points end up beside the box
    plot <- plot +
      geom_point(
        data = data.frame(y = outliers),
        aes(x = 0, y = y),
        color = "red",
        size = 3
      )
  }

  print(plot)
}

# Categorical Variables
categorical_cols <- c("Nationality", "DistributionChannel", "MarketSegment", "SRHighFloor", "SRLowFloor", 
                      "SRAccessibleRoom", "SRMediumFloor", "SRBathtub", "SRShower", "SRCrib", "SRKingSizeBed", 
                      "SRTwinBed", "SRNearElevator", "SRAwayFromElevator", "SRNoAlcoholInMiniBar", "SRQuietRoom")

# Create one bar plot per categorical variable
for (col in categorical_cols) {
  # aes(.data[[col]]) replaces aes_string(), deprecated since ggplot2 3.0.0;
  # labs() still sets the axis title to the plain column name
  plot <- ggplot(HotelLisbon_data, aes(x = .data[[col]])) +
    geom_bar(fill = "blue", color = "black") +
    labs(title = paste("Bar Chart of", col), x = col, y = "Count") +
    theme_minimal() +
    # Tilt the category labels so long or numerous levels stay readable
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  print(plot)
}

#Our business question is: At Hotel Lisbon, how many customers are cancelling bookings (“BookingsCanceled”) and not showing up (“BookingsNoShowed”), according to the customer habits recorded in the Hotel Lisbon data? And what must the hoteliers do to decrease the number of cancellations and no-shows?

#Now let’s create Box plots to investigate outliers

library(ggplot2)
library(dplyr)


# Every column is considered; only the numeric ones are drawn as box plots
all_columns <- names(HotelLisbon_data)

for (col in all_columns) {
  if (is.numeric(HotelLisbon_data[[col]])) {
    # .data[[col]] looks the column up by its literal name, so names that are
    # awkward to write as R expressions (such as "...1") are handled safely;
    # it also replaces aes_string(), deprecated since ggplot2 3.0.0
    plot <- ggplot(HotelLisbon_data, aes(y = .data[[col]])) +
      geom_boxplot(fill = "blue", color = "black") +
      labs(title = paste("Box Plot of", col), y = col, x = "") +
      theme_minimal()

    print(plot)
  }
}

#by investigating the dataset, we have found out Age has some “Null” values and there are some negative values, which should be cleaned or further investigated.

#WE need to create a better categorical table for customer ID, so that we know if there are any repeated customers.

#The variables “DaysSinceLastStay” and “DaysSinceFirstStay” have negative values, which at the moment don’t make much sense.


#Let’s see if BookingsCanceled and BookingsNoshowed are binary or not

# Distinct values recorded for BookingsCanceled
unique_values_canceled <- unique(HotelLisbon_data[["BookingsCanceled"]])

# Distinct values recorded for BookingsNoShowed
unique_values_no_show <- unique(HotelLisbon_data[["BookingsNoShowed"]])

# Distinct values recorded for BookingsCheckedIn
unique_values_checkedIn <- unique(HotelLisbon_data[["BookingsCheckedIn"]])

# Show the distinct values of the three booking counters
cat("Unique values for BookingsCanceled:", unique_values_canceled, "\n")
## Unique values for BookingsCanceled: 0 2 1 4 3 9
cat("Unique values for BookingsNoShowed:", unique_values_no_show, "\n")
## Unique values for BookingsNoShowed: 0 1 2 3
cat("Unique values for BookingsCheckedIn:", unique_values_checkedIn, "\n")
## Unique values for BookingsCheckedIn: 1 0 2 3 8 4 7 12 10 5 34 6 13 9 11 29 14 57 19 15 20 40 23 26 25 17 18
# A variable is treated as binary here when it has exactly two distinct values
is_binary_canceled <- identical(length(unique_values_canceled), 2L)
is_binary_no_show <- identical(length(unique_values_no_show), 2L)
is_binary_checkedIn <- identical(length(unique_values_checkedIn), 2L)

# Report the outcome of each check
cat("BookingsCanceled is binary:", is_binary_canceled, "\n")
## BookingsCanceled is binary: FALSE
cat("BookingsNoShowed is binary:", is_binary_no_show, "\n")
## BookingsNoShowed is binary: FALSE
cat("BookingsCheckedIN is binary:", is_binary_checkedIn, "\n")
## BookingsCheckedIN is binary: FALSE
# Five-number summary plus mean for each of the three booking counters
summary(HotelLisbon_data[["BookingsCanceled"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.001987 0.000000 9.000000
summary(HotelLisbon_data[["BookingsNoShowed"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0000000 0.0000000 0.0005867 0.0000000 3.0000000
summary(HotelLisbon_data[["BookingsCheckedIn"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  1.0000  1.0000  0.7934  1.0000 57.0000
# Look up the storage class of the three booking counters
class_canceled <- class(HotelLisbon_data[["BookingsCanceled"]])
class_checked_in <- class(HotelLisbon_data[["BookingsCheckedIn"]])
class_no_show <- class(HotelLisbon_data[["BookingsNoShowed"]])

# Report the class of each counter
cat("Class of BookingsCanceled:", class_canceled, "\n")
## Class of BookingsCanceled: numeric
cat("Class of BookingsCheckedIn:", class_checked_in, "\n")
## Class of BookingsCheckedIn: numeric
cat("Class of BookingsNoShowed:", class_no_show, "\n")
## Class of BookingsNoShowed: numeric

#Analyze ‘DaysSinceFirstStay’ and ‘DaysSinceLastStay’ to understand repeat customer behavior.

summary(HotelLisbon_data$DaysSinceFirstStay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    -1.0    27.0   369.0   403.1   698.0  1186.0
summary(HotelLisbon_data$DaysSinceLastStay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    -1.0    26.0   366.0   400.9   694.0  1104.0
summary(HotelLisbon_data$AverageLeadTime)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1.00    0.00   29.00   66.21  103.00  588.00
summary(HotelLisbon_data$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -11.0    34.0    46.0    45.4    56.0   114.0

#Looking at the data table, in some observations of DaysSinceLastStay and DaysSinceFirstStay the value is written as -1; due to this, AverageLeadTime is 0 at these observations. #How do we resolve this? ————————————————————————————————————————————– #By observing the data set, I can see -1 written in DaysSinceLastStay and DaysSinceFirstStay. This is wrong, as the data dictionary says:

#DaysSinceLastStay

#The number of days elapsed between the last day of the extraction and the customer’s last arrival date (of a checked-in booking). A value of 1 indicates the customer never stayed at the hotel

#DaysSinceFirstStay

#The number of days elapsed between the last day of the extraction and the customer’s first arrival date (of a checked-in booking). A value of 1 indicates the customer never stayed at the hotel

This means the value needs to be set to 1, removing the negative sign.

#AverageLeadTime #The average number of days elapsed between the customer’s booking date and arrival date. In other words, this variable is calculated by dividing the sum of the number of days elapsed between the moment each booking was made and its arrival date, by the total of bookings made by the customer

#Looking at BookingsCheckedIn, BookingsCanceled and BookingsNoShowed for customers with -1 in DaysSinceFirstStay and DaysSinceLastStay, they are all 0, which is not right. If the customer did not show up, BookingsCanceled or BookingsNoShowed should be 1, but they are 0. This needs to be corrected.

We need to make a new binary variable from the no-shows and cancellations, and evaluate whether these variables are indeed binary.

We also have to shorten our collection of variables

#do we need Lodging Revenue and Other Revenue and transform them into log

#Filtering or correcting negative values and single-digit ages in ‘Age’, ‘AverageLeadTime’, ‘DaysSinceLastStay’, and ‘DaysSinceFirstStay’, and removing the negative sign from DaysSinceFirstStay and DaysSinceLastStay so that BookingsCheckedIn is 0 and BookingsNoShowed and BookingsCanceled are 1

# Coerce Age to numeric, going through character so that any factor labels
# (rather than factor codes) are what gets converted
HotelLisbon_data$Age <- as.numeric(as.character(HotelLisbon_data$Age))

# Ages below 16 (which covers every negative value too) are treated as
# invalid and blanked out
HotelLisbon_data$Age <- replace(
  HotelLisbon_data$Age,
  which(HotelLisbon_data$Age < 16),
  NA
)

# Impute every missing Age with the median of the remaining valid ages
median_age <- median(HotelLisbon_data$Age, na.rm = TRUE)
HotelLisbon_data$Age <- replace(
  HotelLisbon_data$Age,
  is.na(HotelLisbon_data$Age),
  median_age
)

# Print the median age
cat("Median Age:", median_age, "\n")
## Median Age: 46
# Flag the customers carrying the -1 sentinel BEFORE it is recoded. Building
# the mask from the recoded value (== 1) would also match customers whose
# genuine days-since-stay is 1 and overwrite their real booking counts.
never_stayed <- HotelLisbon_data$DaysSinceFirstStay %in% c(-1) |
  HotelLisbon_data$DaysSinceLastStay %in% c(-1)

# Replace -1 in DaysSinceLastStay and DaysSinceFirstStay with 1
HotelLisbon_data$DaysSinceLastStay[HotelLisbon_data$DaysSinceLastStay %in% c(-1)] <- 1
HotelLisbon_data$DaysSinceFirstStay[HotelLisbon_data$DaysSinceFirstStay %in% c(-1)] <- 1

# Update BookingsCheckedIn, BookingsCanceled, and BookingsNoShowed, but only
# for the customers that were flagged with the sentinel.
# NOTE(review): this marks every flagged customer as both a no-show and a
# cancellation and replaces any existing counts with 1 -- confirm that this
# double assignment is intended.
HotelLisbon_data$BookingsCheckedIn[never_stayed] <- 0
HotelLisbon_data$BookingsNoShowed[never_stayed] <- 1
HotelLisbon_data$BookingsCanceled[never_stayed] <- 1

# Check for missing values after replacement
sum(is.na(HotelLisbon_data$Age))
## [1] 0
summary(HotelLisbon_data$DaysSinceLastStay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    26.0   366.0   401.4   694.0  1104.0
summary(HotelLisbon_data$DaysSinceFirstStay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    27.0   369.0   403.6   698.0  1186.0
summary(HotelLisbon_data$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   37.00   46.00   47.04   56.00  114.00
summary(HotelLisbon_data$BookingsCanceled)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2405  0.0000  9.0000
summary(HotelLisbon_data$BookingsNoShowed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2391  0.0000  3.0000
summary(HotelLisbon_data$BookingsCheckedIn)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  1.0000  1.0000  0.7933  1.0000 57.0000

#The negative values in these variables have been resolved. The single-digit ages have also been changed so that every age is 16 or above. #BookingsCanceled and BookingsNoShowed now have a value of 1 where previously they were 0 because of the -1 written in DaysSinceFirstStay and DaysSinceLastStay.

#Let’s check again the unique values of Bookings checked In, Bookings canceled and Bookings No showed.

# Re-collect the distinct values of each booking-count column after cleaning

# Distinct values of BookingsCheckedIn
New_unique_values_checkedIn <- unique(HotelLisbon_data[["BookingsCheckedIn"]])

# Distinct values of BookingsNoShowed
New_unique_values_no_show <- unique(HotelLisbon_data[["BookingsNoShowed"]])

# Distinct values of BookingsCanceled
New_unique_values_canceled <- unique(HotelLisbon_data[["BookingsCanceled"]])

# Print the results
cat("New Unique values for BookingsCanceled:", New_unique_values_canceled, "\n")
## New Unique values for BookingsCanceled: 0 1 4 3 2 9
cat("New Unique values for BookingsNoShowed:", New_unique_values_no_show, "\n")
## New Unique values for BookingsNoShowed: 0 1 2 3
cat("New Unique values for BookingsCheckedIn:", New_unique_values_checkedIn, "\n")
## New Unique values for BookingsCheckedIn: 1 0 2 3 8 4 7 12 10 5 34 6 13 9 11 29 14 57 19 15 20 40 23 26 25 17 18

#Let’s count each variables frequency for these unique values

# Tabulate how often each distinct count occurs in the three booking columns

# Occurrences of each value in BookingsCheckedIn
frequency_checked_in <- table(HotelLisbon_data[["BookingsCheckedIn"]])

# Occurrences of each value in BookingsNoShowed
frequency_no_show <- table(HotelLisbon_data[["BookingsNoShowed"]])

# Occurrences of each value in BookingsCanceled
frequency_canceled <- table(HotelLisbon_data[["BookingsCanceled"]])

# Print the results
cat("Frequency of each numeric value in BookingsCanceled:\n", frequency_canceled, "\n\n")
## Frequency of each numeric value in BookingsCanceled:
##  57004 17976 9 7 3 1
cat("Frequency of each numeric value in BookingsNoShowed:\n", frequency_no_show, "\n\n")
## Frequency of each numeric value in BookingsNoShowed:
##  57074 17920 5 1
cat("Frequency of each numeric value in BookingsCheckedIn:\n", frequency_checked_in, "\n")
## Frequency of each numeric value in BookingsCheckedIn:
##  17889 55813 1017 121 57 18 17 16 9 7 6 6 4 4 2 2 1 1 1 1 1 1 1 2 1 1 1
# Tabulate BookingsCanceled as a data frame (Var1 = value, Freq = row count)
frequency_canceled <- as.data.frame(table(HotelLisbon_data$BookingsCanceled))
cat("Frequency of each numeric value in BookingsCanceled:\n")
## Frequency of each numeric value in BookingsCanceled:
# Print one "value : count" line per row. seq_len() gives an empty sequence
# for a zero-row table, whereas 1:nrow() would iterate over c(1, 0).
for (i in seq_len(nrow(frequency_canceled))) {
  cat(paste(frequency_canceled$Var1[i], ":", frequency_canceled$Freq[i]), "\n")
}
## 0 : 57004 
## 1 : 17976 
## 2 : 9 
## 3 : 7 
## 4 : 3 
## 9 : 1
cat("\n")
# Tabulate BookingsNoShowed as a data frame (Var1 = value, Freq = row count)
frequency_no_show <- as.data.frame(table(HotelLisbon_data$BookingsNoShowed))
cat("Frequency of each numeric value in BookingsNoShowed:\n")
## Frequency of each numeric value in BookingsNoShowed:
# Print one "value : count" line per row. seq_len() gives an empty sequence
# for a zero-row table, whereas 1:nrow() would iterate over c(1, 0).
for (i in seq_len(nrow(frequency_no_show))) {
  cat(paste(frequency_no_show$Var1[i], ":", frequency_no_show$Freq[i]), "\n")
}
## 0 : 57074 
## 1 : 17920 
## 2 : 5 
## 3 : 1
cat("\n")
# Tabulate BookingsCheckedIn as a data frame (Var1 = value, Freq = row count)
frequency_checked_in <- as.data.frame(table(HotelLisbon_data$BookingsCheckedIn))
cat("Frequency of each numeric value in BookingsCheckedIn:\n")
## Frequency of each numeric value in BookingsCheckedIn:
# Print one "value : count" line per row. seq_len() gives an empty sequence
# for a zero-row table, whereas 1:nrow() would iterate over c(1, 0).
for (i in seq_len(nrow(frequency_checked_in))) {
  cat(paste(frequency_checked_in$Var1[i], ":", frequency_checked_in$Freq[i]), "\n")
}
## 0 : 17889 
## 1 : 55813 
## 2 : 1017 
## 3 : 121 
## 4 : 57 
## 5 : 18 
## 6 : 17 
## 7 : 16 
## 8 : 9 
## 9 : 7 
## 10 : 6 
## 11 : 6 
## 12 : 4 
## 13 : 4 
## 14 : 2 
## 15 : 2 
## 17 : 1 
## 18 : 1 
## 19 : 1 
## 20 : 1 
## 23 : 1 
## 25 : 1 
## 26 : 1 
## 29 : 2 
## 34 : 1 
## 40 : 1 
## 57 : 1

#BookingsCanceled:

The majority of customers (57,004 out of 75,000) did not cancel their bookings (0). A significant number of customers (17,976) canceled their bookings once (1). There are a few cases where customers canceled multiple bookings (2, 3, 4, 9 times).

#BookingsNoShowed:

The majority of customers (57,074 out of 75,000) did not have a “no-show” (0). A significant number of customers (17,920) had one “no-show” incident (1). There are a few cases where customers had multiple “no-show” incidents (2, 3).

#BookingsCheckedIn:

A large number of customers (55,813 out of 75,000) checked in once (1). There are customers who checked in multiple times, with various frequencies. Based on the data dictionary and the understanding of the variables: BookingsCheckedIn: 0: 17,889 customers did not check in for any bookings. 1: 55,813 customers checked in once. 2: 1,017 customers checked in twice. 3: 121 customers checked in thrice. 4: 57 customers checked in four times. 5-15: There is a decreasing trend in the number of customers who checked in for more bookings, with decreasing frequency. 17-40: The frequency further decreases for customers who checked in for a higher number of bookings. 57: There is one customer who checked in 57 times. Analysis:

The majority of customers (55,813) checked in for just one booking, indicating that a significant portion of customers had a single stay event. The number of customers decreases as the number of check-ins increases, suggesting that fewer customers have multiple stay events. There are outliers or rare cases where a small number of customers have a very high number of check-ins, which might be unusual or require further investigation. This variable provides insights into customer behavior regarding the frequency of stays, with the majority having a limited number of stays.

Let’s create new categories for BookingsCheckedIn, BookingsCanceled and BookingsNoShowed, so that we have fewer categories and a more concise description of these customers.

# Categorize BookingsCheckedIn counts into descriptive labels.
#
# count: numeric vector (or single value) of checked-in booking counts.
# Returns a character vector of the same length as `count`:
#   0       -> "Booked but Not Checked In"
#   1, 2, 3 -> "Stayed Once" / "Stayed Twice" / "Stayed Thrice"
#   4..15   -> "Good Loyal Customers"
#   >= 16   -> "Great Loyal Customers"
#   anything else (NA, negative, fractional) -> "Undefined Category"
#
# The upper tier is open-ended: bounding it at the sample's observed values
# would leave a count of 16, or anything above 57, in "Undefined Category".
categorize_checked_in <- function(count) {
  category <- rep("Undefined Category", length(count))
  # which() drops NA comparisons, so missing counts keep the fallback label
  category[which(count == 0)] <- "Booked but Not Checked In"
  category[which(count == 1)] <- "Stayed Once"
  category[which(count == 2)] <- "Stayed Twice"
  category[which(count == 3)] <- "Stayed Thrice"
  category[which(count >= 4 & count <= 15)] <- "Good Loyal Customers"
  category[which(count >= 16)] <- "Great Loyal Customers"
  category
}

# creating a new variable for BookingsCheckedIn; vapply() guarantees exactly
# one character label per row instead of sapply()'s input-dependent result
HotelLisbon_data$CheckedInCategory <- vapply(
  HotelLisbon_data$BookingsCheckedIn, categorize_checked_in, character(1)
)

# Categorize BookingsCanceled counts into descriptive labels.
#
# count: numeric vector (or single value) of canceled booking counts.
# Returns a character vector of the same length as `count`:
#   0    -> "Checked In and Stayed"
#   >= 1 -> "Canceled Bookings"
#   anything else (NA, negative, between 0 and 1) -> "Undefined Category"
#
# The "canceled" tier is open-ended so that counts above the sample's
# observed maximum are not reported as "Undefined Category".
categorize_canceled <- function(count) {
  category <- rep("Undefined Category", length(count))
  # which() drops NA comparisons, so missing counts keep the fallback label
  category[which(count == 0)] <- "Checked In and Stayed"
  category[which(count >= 1)] <- "Canceled Bookings"
  category
}

# creating a new variable for BookingsCanceled; vapply() guarantees exactly
# one character label per row instead of sapply()'s input-dependent result
HotelLisbon_data$CanceledCategory <- vapply(
  HotelLisbon_data$BookingsCanceled, categorize_canceled, character(1)
)

# Categorize BookingsNoShowed counts into descriptive labels.
#
# count: numeric vector (or single value) of no-show booking counts.
# Returns a character vector of the same length as `count`:
#   0    -> "Customers Showed up and Stayed"
#   >= 1 -> "No Show Customers"
#   anything else (NA, negative, between 0 and 1) -> "Undefined Category"
#
# The "no show" tier is open-ended so that counts above the sample's
# observed maximum are not reported as "Undefined Category".
categorize_no_show <- function(count) {
  category <- rep("Undefined Category", length(count))
  # which() drops NA comparisons, so missing counts keep the fallback label
  category[which(count == 0)] <- "Customers Showed up and Stayed"
  category[which(count >= 1)] <- "No Show Customers"
  category
}

# creating a new variable for BookingsNoShowed; vapply() guarantees exactly
# one character label per row instead of sapply()'s input-dependent result
HotelLisbon_data$NoShowCategory <- vapply(
  HotelLisbon_data$BookingsNoShowed, categorize_no_show, character(1)
)

# Viewing the results
head(HotelLisbon_data[, c("CheckedInCategory", "CanceledCategory", "NoShowCategory")])
## # A tibble: 6 × 3
##   CheckedInCategory         CanceledCategory      NoShowCategory                
##   <chr>                     <chr>                 <chr>                         
## 1 Stayed Once               Checked In and Stayed Customers Showed up and Stayed
## 2 Stayed Once               Checked In and Stayed Customers Showed up and Stayed
## 3 Stayed Once               Checked In and Stayed Customers Showed up and Stayed
## 4 Stayed Once               Checked In and Stayed Customers Showed up and Stayed
## 5 Stayed Once               Checked In and Stayed Customers Showed up and Stayed
## 6 Booked but Not Checked In Canceled Bookings     No Show Customers
# Count the customers per label in each derived category column, stored as
# data frames so they print with the label next to its count

# Customers per NoShowCategory label
frequency_no_show_category <- as.data.frame(table(HotelLisbon_data[["NoShowCategory"]]))

# Customers per CanceledCategory label
frequency_canceled_category <- as.data.frame(table(HotelLisbon_data[["CanceledCategory"]]))

# Customers per CheckedInCategory label
frequency_checked_in_category <- as.data.frame(table(HotelLisbon_data[["CheckedInCategory"]]))

# View the results
print("Frequency of each customer in CheckedInCategory:")
## [1] "Frequency of each customer in CheckedInCategory:"
print(frequency_checked_in_category)
##                        Var1  Freq
## 1 Booked but Not Checked In 17889
## 2      Good Loyal Customers   148
## 3     Great Loyal Customers    12
## 4               Stayed Once 55813
## 5             Stayed Thrice   121
## 6              Stayed Twice  1017
cat("\n")
print("Frequency of each customer in CanceledCategory:")
## [1] "Frequency of each customer in CanceledCategory:"
print(frequency_canceled_category)
##                    Var1  Freq
## 1     Canceled Bookings 17996
## 2 Checked In and Stayed 57004
cat("\n")
print("Frequency of each customer in NoShowCategory:")
## [1] "Frequency of each customer in NoShowCategory:"
print(frequency_no_show_category)
##                             Var1  Freq
## 1 Customers Showed up and Stayed 57074
## 2              No Show Customers 17926

#We have created three new categorical variables that make the booking-frequency readings more presentable.

Now we have clearly categorized the customers’ booking, cancellation and no-show data.

#Let’s make a new binary variable called CustomerOutcome from the variables BookingsCheckedIn, BookingsCanceled and BookingsNoShowed

# CustomerOutcome is 0 for customers with at least one cancellation or
# no-show and 1 for everyone else (logical negation coerced to numeric)
HotelLisbon_data$CustomerOutcome <- as.numeric(
  !(HotelLisbon_data$BookingsCanceled > 0 | HotelLisbon_data$BookingsNoShowed > 0)
)

# Display the unique values of CustomerOutcome
unique(HotelLisbon_data$CustomerOutcome)
## [1] 1 0
table(HotelLisbon_data$CustomerOutcome)
## 
##     0     1 
## 18020 56980

We have successfully created the binary variable “CustomerOutcome”. #It shows that 18020 customers did not show up or canceled their reservations # 56980 customers booked and stayed at the Hotel

#Data dimension:

ncol(HotelLisbon_data)
## [1] 36
colnames(HotelLisbon_data)
##  [1] "...1"                 "ID"                   "Nationality"         
##  [4] "Age"                  "DaysSinceCreation"    "NameHash"            
##  [7] "DocIDHash"            "AverageLeadTime"      "LodgingRevenue"      
## [10] "OtherRevenue"         "BookingsCanceled"     "BookingsNoShowed"    
## [13] "BookingsCheckedIn"    "PersonsNights"        "RoomNights"          
## [16] "DaysSinceLastStay"    "DaysSinceFirstStay"   "DistributionChannel" 
## [19] "MarketSegment"        "SRHighFloor"          "SRLowFloor"          
## [22] "SRAccessibleRoom"     "SRMediumFloor"        "SRBathtub"           
## [25] "SRShower"             "SRCrib"               "SRKingSizeBed"       
## [28] "SRTwinBed"            "SRNearElevator"       "SRAwayFromElevator"  
## [31] "SRNoAlcoholInMiniBar" "SRQuietRoom"          "CheckedInCategory"   
## [34] "CanceledCategory"     "NoShowCategory"       "CustomerOutcome"
sapply(HotelLisbon_data, class)
##                 ...1                   ID          Nationality 
##            "numeric"            "numeric"             "factor" 
##                  Age    DaysSinceCreation             NameHash 
##            "numeric"            "numeric"          "character" 
##            DocIDHash      AverageLeadTime       LodgingRevenue 
##          "character"            "numeric"            "numeric" 
##         OtherRevenue     BookingsCanceled     BookingsNoShowed 
##            "numeric"            "numeric"            "numeric" 
##    BookingsCheckedIn        PersonsNights           RoomNights 
##            "numeric"            "numeric"            "numeric" 
##    DaysSinceLastStay   DaysSinceFirstStay  DistributionChannel 
##            "numeric"            "numeric"             "factor" 
##        MarketSegment          SRHighFloor           SRLowFloor 
##             "factor"             "factor"             "factor" 
##     SRAccessibleRoom        SRMediumFloor            SRBathtub 
##             "factor"             "factor"             "factor" 
##             SRShower               SRCrib        SRKingSizeBed 
##             "factor"             "factor"             "factor" 
##            SRTwinBed       SRNearElevator   SRAwayFromElevator 
##             "factor"             "factor"             "factor" 
## SRNoAlcoholInMiniBar          SRQuietRoom    CheckedInCategory 
##             "factor"             "factor"          "character" 
##     CanceledCategory       NoShowCategory      CustomerOutcome 
##          "character"          "character"            "numeric"
summary(HotelLisbon_data$Nationality) 
##     FRA     PRT     DEU     GBR     ESP     USA     ITA     BEL     BRA     NLD 
##   11170   10365    9242    7750    4399    3076    3007    2806    2564    2461 
##     CHE     IRL     CAN     AUT     SWE     CHN     ISR     NOR     POL     AUS 
##    1913    1780    1364    1334    1119     816     802     708     662     638 
##     DNK     FIN     RUS     ROU     HUN     JPN     CZE     GRC     LUX     IND 
##     594     589     529     432     289     245     224     216     205     185 
##     KOR     AGO     MEX     MAR     ARG     BGR     TUR     SRB     HRV     EST 
##     176     162     156     155     154     143     141     136     127     112 
##     UKR     NZL     LVA     MOZ     DZA     IRN     TWN     SVK     CHL     COL 
##      88      84      83      70      69      68      67      66      65      62 
##     LTU     ZAF     SVN     CYP     ISL     PHL     AZE     SGP     PER     THA 
##      60      55      54      52      51      46      43      39      37      35 
##     MLT     LBN     URY     SAU     BLR     ARE     VNM     TUN     VEN     CMR 
##      33      31      31      29      28      27      27      25      23      22 
##     BIH     ECU     EGY     IDN     MYS     KAZ     PAN     JOR     MKD     CRI 
##      21      20      20      18      18      16      16      15      14      13 
##     DOM     NGA     ALB     PAK     KWT     ARM     CPV     GNB     IRQ     KEN 
##      13      13      12      12      11      10      10      10      10      10 
##     PRY     BOL     GEO     ATF     BHR     CUB     LKA     AND     BGD (Other) 
##      10       9       9       8       8       8       8       7       7     228

#Nationality covers a large set of countries. Let’s group these countries into continents

install.packages("countrycode")
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'countrycode' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
library(countrycode)
## Warning: package 'countrycode' was built under R version 4.3.2
# Load the dplyr package for data manipulation
library(dplyr)

# Named lookup vector: three-letter country code -> two-letter continent code
# (AF, AN, AS, EU, NA, OC, SA). Each code appears exactly once; the repeated
# LAO, ABW and ATA entries and the two-letter key `KI` (KIR is already
# listed) have been removed. Lookups by name return the first match, so
# results for existing codes are unchanged.
continent_mapping <- c(
  BRA = 'SA', CAN = 'NA', PHL = 'AS', FRA = 'EU', HUN = 'EU', 
  ITA = 'EU', EST = 'EU', BEL = 'EU', PRT = 'EU', GBR = 'EU', 
  USA = 'NA', CHN = 'AS', IRL = 'EU', DEU = 'EU', CHE = 'EU', 
  AUS = 'OC', ESP = 'EU', RUS = 'EU', AUT = 'EU', ROU = 'EU', 
  NLD = 'EU', TUR = 'EU', ARG = 'SA', CZE = 'EU', CYP = 'EU', 
  FIN = 'EU', POL = 'EU', NOR = 'EU', JPN = 'AS', NZL = 'OC', 
  CAF = 'AF', SRB = 'EU', BGR = 'EU', SYR = 'AS', UKR = 'EU', 
  VEN = 'SA', SWE = 'EU', ISR = 'AS', DNK = 'EU', URY = 'SA',
  MAR = 'AF', AGO = 'AF', IND = 'AS', CHL = 'SA', BHR = 'AS', 
  MEX = 'NA', THA = 'AS', PAK = 'AS', IDN = 'AS', HRV = 'EU', 
  LUX = 'EU', ARE = 'AS', TUN = 'AF', LVA = 'EU', UZB = 'AS', 
  TGO = 'AF', SGP = 'AS', BLR = 'EU', GRC = 'EU', ARM = 'AS', 
  DZA = 'AF', GNB = 'AF', SVK = 'EU', CRI = 'NA', EGY = 'AF', 
  DOM = 'NA', IRN = 'AS', SVN = 'EU', ZAF = 'AF', MKD = 'EU', 
  HKG = 'AS', ISL = 'EU', MDV = 'AS', MOZ = 'AF', IRQ = 'AS', 
  MYS = 'AS', LCA = 'NA', KOR = 'AS', SUR = 'SA', LTU = 'EU', 
  PRY = 'SA', BOL = 'SA', TWN = 'AS', STP = 'AF', MMR = 'AS', 
  CMR = 'AF', SAU = 'AS', KWT = 'AS', AZE = 'AS', JEY = 'EU', 
  COL = 'SA', ALB = 'EU', PAN = 'NA', LBN = 'AS', ECU = 'SA', 
  NGA = 'AF', MUS = 'AF', MLT = 'EU', BIH = 'EU', KAZ = 'AS', 
  LBY = 'AF', CPV = 'AF', LKA = 'AS', ATA = 'AN', PER = 'SA', 
  CIV = 'AF', VNM = 'AS', MNE = 'EU', COM = 'AF', OMN = 'AS', 
  QAT = 'AS', GAB = 'AF', COD = 'AF', BGD = 'AS', NAM = 'AF', 
  HTI = 'NA', GEO = 'AS', GIB = 'EU', JOR = 'AS', SYC = 'AF', 
  LIE = 'EU', SEN = 'AF', ATF = 'AN', KEN = 'AF', VIR = 'NA', 
  PYF = 'OC', UGA = 'AF', TZA = 'AF', SMR = 'EU', KGZ = 'AS', 
  PRI = 'NA', NCL = 'OC', BWA = 'AF', GTM = 'NA', BRB = 'NA', 
  MWI = 'AF', NIC = 'NA', LAO = 'AS', MLI = 'AF', RWA = 'AF', 
  ASM = 'OC', DMA = 'NA', MRT = 'AF', AIA = 'NA', CUB = 'NA', 
  SDN = 'AF', JAM = 'NA', TKM = 'AS', SWZ = 'AF', MCO = 'EU', 
  WSM = 'OC', AND = 'EU', KNA = 'NA', ERI = 'AF', BEN = 'AF', 
  SLV = 'NA', GUF = 'SA', ABW = 'NA', FRO = 'EU', ZWE = 'AF', 
  ATG = 'NA', SLE = 'AF', GUY = 'SA', TCD = 'AF', FLK = 'SA', 
  SPM = 'NA', SOM = 'AF', GHA = 'AF', UMI = 'OC', TJK = 'AS', 
  ETH = 'AF', KIR = 'OC', PCN = 'OC', MNG = 'AS', 
  BTN = 'AS', MHL = 'OC', VUT = 'OC', TLS = 'AS', 
  FJI = 'OC', COK = 'OC', NRU = 'OC', TUV = 'OC', SLB = 'OC', 
  MNP = 'OC', FSM = 'OC', PLW = 'OC', MAF = 'NA', SXM = 'NA', 
  CUW = 'NA', BES = 'NA', SGS = 'AN', BVT = 'AN', 
  CXR = 'AS', CCK = 'AS', HMD = 'AN', NFK = 'OC'
)

# Nationality is stored as a factor; convert it to character first so the
# lookup indexes continent_mapping by country code rather than by the
# factor's integer codes. The looked-up value becomes the new Continent column.
HotelLisbon_data <- HotelLisbon_data %>%
  mutate(
    Nationality = as.character(Nationality),
    Continent = continent_mapping[Nationality]
  )

# Country codes for which the lookup produced no continent (NA)
missing_continents <- with(
  HotelLisbon_data,
  unique(Nationality[is.na(Continent)])
)

# Print missing continents to see which countries need to be categorized
print(missing_continents)
##  [1] "TON" "IOT" "WLF" "BHS" "MDG" "GNQ" "YEM" "PNG" "VCT" "NPL" "GIN"
# Recompute the Continent column from continent_mapping (same lookup as
# before, so the column is overwritten with identical values)
HotelLisbon_data <- mutate(
  HotelLisbon_data,
  Continent = continent_mapping[Nationality]
)

# Manually assign continents for the country codes that continent_mapping
# does not cover. Tonga (TON) is a Pacific island state and therefore belongs
# to Oceania ("OC"), not Africa.
manual_continents <- c(
  "TON" = "OC",
  "IOT" = "AS",
  "WLF" = "OC",
  "BHS" = "NA",
  "MDG" = "AF",
  "GNQ" = "AF",
  "YEM" = "AS",
  "PNG" = "OC",
  "VCT" = "NA",
  "NPL" = "AS",
  "GIN" = "AF"
)

# For rows whose Nationality has a manual assignment, take the continent from
# manual_continents; every other row keeps its current Continent value
HotelLisbon_data <- mutate(
  HotelLisbon_data,
  Continent = ifelse(
    is.na(match(Nationality, names(manual_continents))),
    Continent,
    manual_continents[Nationality]
  )
)

# Country codes that still have no continent after the manual assignment
missing_continents <- with(
  HotelLisbon_data,
  unique(Nationality[is.na(Continent)])
)

# Print missing continents to see which countries need to be categorized
print(missing_continents)
## character(0)

We have managed to manually input all countries into a new variable called continent.

# Store Continent as a factor (one level per distinct continent code)
HotelLisbon_data$Continent <- factor(HotelLisbon_data$Continent)

# Check the structure of the dataset to confirm the changes
str(HotelLisbon_data)
## tibble [75,000 × 37] (S3: tbl_df/tbl/data.frame)
##  $ ...1                : num [1:75000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ ID                  : num [1:75000] 20351 62663 30398 39784 17929 ...
##  $ Nationality         : chr [1:75000] "BRA" "CAN" "PHL" "FRA" ...
##  $ Age                 : num [1:75000] 85 30 70 31 29 46 49 55 53 47 ...
##  $ DaysSinceCreation   : num [1:75000] 733 178 564 430 785 314 794 237 817 750 ...
##  $ NameHash            : chr [1:75000] "0x0BF2ECC3BF14F7FF3F926275E9BAAFAFF5823E69F81319DCCC5867DC986E10DC" "0xE4899D5F1CF2354CE1EBCD1717CE2EC2D91DE694C9118ADA37CB726A3F43DE22" "0xA1D0401C1635B389B99596B21D6C463B6E63444457B010DDD3D31AC1CE19C2ED" "0x0C948619213E11A1EB2E326CA64277BAFADDDDE51D7D2EB9DA7B71295C8704BC" ...
##  $ DocIDHash           : chr [1:75000] "0x44749B4F7510099B0A4BEF85DE72E75ABD3CC90896949AAC4EF1A46598DCE490" "0x78C451F6556F7129351AE28B3BA7DD499E258DDBEC31F89302AE62899301DB4A" "0x613A9E9859B7CA68E8D3613BE4B2880059B2A5134E2B2B8C85186EAC073A3AC8" "0x56D89BC906AA74D89F63D75436A2BBC0B2DE9EAD2D49CEA970713BD02290AE54" ...
##  $ AverageLeadTime     : num [1:75000] 41 119 94 47 148 0 33 230 213 157 ...
##  $ LodgingRevenue      : num [1:75000] 53 1041 1512 219 269 ...
##  $ OtherRevenue        : num [1:75000] 14 162 72 146 58.5 ...
##  $ BookingsCanceled    : num [1:75000] 0 0 0 0 0 1 0 0 0 0 ...
##  $ BookingsNoShowed    : num [1:75000] 0 0 0 0 0 1 0 0 0 0 ...
##  $ BookingsCheckedIn   : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
##  $ PersonsNights       : num [1:75000] 2 6 8 6 8 0 2 5 8 6 ...
##  $ RoomNights          : num [1:75000] 1 3 4 3 4 0 2 5 4 3 ...
##  $ DaysSinceLastStay   : num [1:75000] 734 181 568 433 789 1 796 242 821 753 ...
##  $ DaysSinceFirstStay  : num [1:75000] 734 181 568 433 789 1 796 242 821 753 ...
##  $ DistributionChannel : Factor w/ 4 levels "Corporate","Direct",..: 4 4 4 4 2 4 2 4 4 4 ...
##  $ MarketSegment       : Factor w/ 7 levels "Aviation","Complementary",..: 7 6 6 7 4 6 4 5 6 7 ...
##  $ SRHighFloor         : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ SRLowFloor          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRAccessibleRoom    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRMediumFloor       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRBathtub           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRShower            : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRCrib              : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
##  $ SRKingSizeBed       : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 2 1 ...
##  $ SRTwinBed           : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
##  $ SRNearElevator      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRAwayFromElevator  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRNoAlcoholInMiniBar: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRQuietRoom         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
##  $ CheckedInCategory   : chr [1:75000] "Stayed Once" "Stayed Once" "Stayed Once" "Stayed Once" ...
##  $ CanceledCategory    : chr [1:75000] "Checked In and Stayed" "Checked In and Stayed" "Checked In and Stayed" "Checked In and Stayed" ...
##  $ NoShowCategory      : chr [1:75000] "Customers Showed up and Stayed" "Customers Showed up and Stayed" "Customers Showed up and Stayed" "Customers Showed up and Stayed" ...
##  $ CustomerOutcome     : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
##  $ Continent           : Factor w/ 7 levels "AF","AN","AS",..: 7 5 3 4 4 4 4 4 4 4 ...
# Check the class of each variable
variable_classes <- sapply(HotelLisbon_data, class)

# Print the result
print(variable_classes)
##                 ...1                   ID          Nationality 
##            "numeric"            "numeric"          "character" 
##                  Age    DaysSinceCreation             NameHash 
##            "numeric"            "numeric"          "character" 
##            DocIDHash      AverageLeadTime       LodgingRevenue 
##          "character"            "numeric"            "numeric" 
##         OtherRevenue     BookingsCanceled     BookingsNoShowed 
##            "numeric"            "numeric"            "numeric" 
##    BookingsCheckedIn        PersonsNights           RoomNights 
##            "numeric"            "numeric"            "numeric" 
##    DaysSinceLastStay   DaysSinceFirstStay  DistributionChannel 
##            "numeric"            "numeric"             "factor" 
##        MarketSegment          SRHighFloor           SRLowFloor 
##             "factor"             "factor"             "factor" 
##     SRAccessibleRoom        SRMediumFloor            SRBathtub 
##             "factor"             "factor"             "factor" 
##             SRShower               SRCrib        SRKingSizeBed 
##             "factor"             "factor"             "factor" 
##            SRTwinBed       SRNearElevator   SRAwayFromElevator 
##             "factor"             "factor"             "factor" 
## SRNoAlcoholInMiniBar          SRQuietRoom    CheckedInCategory 
##             "factor"             "factor"          "character" 
##     CanceledCategory       NoShowCategory      CustomerOutcome 
##          "character"          "character"            "numeric" 
##            Continent 
##             "factor"

#Dropping variables

# Remove the row-index column, the raw identifiers/hashes and the columns
# that are no longer needed; all_of() makes the selection by exact name
HotelLisbon_data <- select(
  HotelLisbon_data,
  -all_of(c(
    "...1", "Nationality", "DaysSinceCreation", "NameHash", "DocIDHash",
    "AverageLeadTime", "DaysSinceLastStay", "DaysSinceFirstStay"
  ))
)
str(HotelLisbon_data)
## tibble [75,000 × 29] (S3: tbl_df/tbl/data.frame)
##  $ ID                  : num [1:75000] 20351 62663 30398 39784 17929 ...
##  $ Age                 : num [1:75000] 85 30 70 31 29 46 49 55 53 47 ...
##  $ LodgingRevenue      : num [1:75000] 53 1041 1512 219 269 ...
##  $ OtherRevenue        : num [1:75000] 14 162 72 146 58.5 ...
##  $ BookingsCanceled    : num [1:75000] 0 0 0 0 0 1 0 0 0 0 ...
##  $ BookingsNoShowed    : num [1:75000] 0 0 0 0 0 1 0 0 0 0 ...
##  $ BookingsCheckedIn   : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
##  $ PersonsNights       : num [1:75000] 2 6 8 6 8 0 2 5 8 6 ...
##  $ RoomNights          : num [1:75000] 1 3 4 3 4 0 2 5 4 3 ...
##  $ DistributionChannel : Factor w/ 4 levels "Corporate","Direct",..: 4 4 4 4 2 4 2 4 4 4 ...
##  $ MarketSegment       : Factor w/ 7 levels "Aviation","Complementary",..: 7 6 6 7 4 6 4 5 6 7 ...
##  $ SRHighFloor         : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ SRLowFloor          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRAccessibleRoom    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRMediumFloor       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRBathtub           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRShower            : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRCrib              : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
##  $ SRKingSizeBed       : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 2 1 ...
##  $ SRTwinBed           : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
##  $ SRNearElevator      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRAwayFromElevator  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRNoAlcoholInMiniBar: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SRQuietRoom         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
##  $ CheckedInCategory   : chr [1:75000] "Stayed Once" "Stayed Once" "Stayed Once" "Stayed Once" ...
##  $ CanceledCategory    : chr [1:75000] "Checked In and Stayed" "Checked In and Stayed" "Checked In and Stayed" "Checked In and Stayed" ...
##  $ NoShowCategory      : chr [1:75000] "Customers Showed up and Stayed" "Customers Showed up and Stayed" "Customers Showed up and Stayed" "Customers Showed up and Stayed" ...
##  $ CustomerOutcome     : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
##  $ Continent           : Factor w/ 7 levels "AF","AN","AS",..: 7 5 3 4 4 4 4 4 4 4 ...

#Let’s convert the variables to the specific classes we want

# IDs are identifiers rather than quantities, so store them as text.
HotelLisbon_data[["ID"]] <- as.character(HotelLisbon_data[["ID"]])

# Columns that are treated as categorical from here on.
vars_to_factor <- c(
  "BookingsCanceled", "BookingsNoShowed", "BookingsCheckedIn",
  "CheckedInCategory", "CanceledCategory", "NoShowCategory",
  "CustomerOutcome"
)

# Convert each listed column to a factor, one column at a time.
HotelLisbon_data[vars_to_factor] <- Map(as.factor, HotelLisbon_data[vars_to_factor])

# Report the resulting class of every column.
print(sapply(HotelLisbon_data, class))
##                   ID                  Age       LodgingRevenue 
##          "character"            "numeric"            "numeric" 
##         OtherRevenue     BookingsCanceled     BookingsNoShowed 
##            "numeric"             "factor"             "factor" 
##    BookingsCheckedIn        PersonsNights           RoomNights 
##             "factor"            "numeric"            "numeric" 
##  DistributionChannel        MarketSegment          SRHighFloor 
##             "factor"             "factor"             "factor" 
##           SRLowFloor     SRAccessibleRoom        SRMediumFloor 
##             "factor"             "factor"             "factor" 
##            SRBathtub             SRShower               SRCrib 
##             "factor"             "factor"             "factor" 
##        SRKingSizeBed            SRTwinBed       SRNearElevator 
##             "factor"             "factor"             "factor" 
##   SRAwayFromElevator SRNoAlcoholInMiniBar          SRQuietRoom 
##             "factor"             "factor"             "factor" 
##    CheckedInCategory     CanceledCategory       NoShowCategory 
##             "factor"             "factor"             "factor" 
##      CustomerOutcome            Continent 
##             "factor"             "factor"

#correlation table for numeric values.

# Keep only the numeric columns; vapply() guarantees a logical column mask.
numeric_variables <- HotelLisbon_data[, vapply(HotelLisbon_data, is.numeric, logical(1))]

# Pairwise correlations (cor() default method) between the numeric columns.
correlation_matrix <- cor(x = numeric_variables)

# Show the matrix.
print(correlation_matrix)
##                         Age LodgingRevenue OtherRevenue PersonsNights
## Age             1.000000000   -0.001258176   0.08892854    0.04338341
## LodgingRevenue -0.001258176    1.000000000   0.53779318    0.64978685
## OtherRevenue    0.088928543    0.537793184   1.00000000    0.53952786
## PersonsNights   0.043383409    0.649786850   0.53952786    1.00000000
## RoomNights      0.051863488    0.690830179   0.48239974    0.84794739
##                RoomNights
## Age            0.05186349
## LodgingRevenue 0.69083018
## OtherRevenue   0.48239974
## PersonsNights  0.84794739
## RoomNights     1.00000000

#let’s create pivot tables to explore the relationship between “CustomerOutcome” and the variables: “DistributionChannel,” “MarketSegment,” and “Continent.”

#Pivot table for DistributionChannel and CustomerOutcome:

# Cross-tabulate distribution channel (rows) against customer outcome (columns).
pivot_table_distribution <- table(
  HotelLisbon_data[["DistributionChannel"]],
  HotelLisbon_data[["CustomerOutcome"]]
)

# Show the counts.
print(pivot_table_distribution)
##                          
##                               0     1
##   Corporate                 304  2036
##   Direct                   3051  7663
##   Electronic Distribution    17   439
##   Travel Agent/Operator   14648 46842
# Cross-tabulate market segment (rows) against customer outcome (columns).
pivot_table_market <- table(
  HotelLisbon_data[["MarketSegment"]],
  HotelLisbon_data[["CustomerOutcome"]]
)

# Show the counts.
print(pivot_table_market)
##                        
##                             0     1
##   Aviation                 24   197
##   Complementary           107   346
##   Corporate               238  1701
##   Direct                 2972  7340
##   Groups                 1471  7048
##   Other                 11045 32001
##   Travel Agent/Operator  2163  8347
# Cross-tabulate continent (rows) against customer outcome (columns).
pivot_table_continent <- table(
  HotelLisbon_data[["Continent"]],
  HotelLisbon_data[["CustomerOutcome"]]
)

# Show the counts.
print(pivot_table_continent)
##     
##          0     1
##   AF   164   560
##   AN     4     6
##   AS   709  2106
##   EU 14644 48400
##   NA  1440  3249
##   OC   208   527
##   SA   851  2132

#ANOVA Tests for Numerical Variables: Age, LodgingRevenue, OtherRevenue, PersonsNights, RoomNights

# One-way ANOVA: does mean Age differ between the CustomerOutcome groups?
age_anova <- aov(formula = Age ~ CustomerOutcome, data = HotelLisbon_data)
print(summary(age_anova))
##                    Df   Sum Sq Mean Sq F value Pr(>F)    
## CustomerOutcome     1    67013   67013   334.1 <2e-16 ***
## Residuals       74998 15042084     201                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA: does mean LodgingRevenue differ between the CustomerOutcome groups?
LodgingRevenue_anova <- aov(formula = LodgingRevenue ~ CustomerOutcome, data = HotelLisbon_data)
print(summary(LodgingRevenue_anova))
##                    Df    Sum Sq   Mean Sq F value Pr(>F)    
## CustomerOutcome     1 2.021e+09 2.021e+09   17898 <2e-16 ***
## Residuals       74998 8.469e+09 1.129e+05                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA: does mean OtherRevenue differ between the CustomerOutcome groups?
OtherRevenue_anova <- aov(formula = OtherRevenue ~ CustomerOutcome, data = HotelLisbon_data)
print(summary(OtherRevenue_anova))
##                    Df    Sum Sq   Mean Sq F value Pr(>F)    
## CustomerOutcome     1 104542978 104542978    9641 <2e-16 ***
## Residuals       74998 813288511     10844                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA: does mean PersonsNights differ between the CustomerOutcome groups?
PersonsNights_anova <- aov(formula = PersonsNights ~ CustomerOutcome, data = HotelLisbon_data)
print(summary(PersonsNights_anova))
##                    Df  Sum Sq Mean Sq F value Pr(>F)    
## CustomerOutcome     1  493983  493983   34844 <2e-16 ***
## Residuals       74998 1063257      14                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA: does mean RoomNights differ between the CustomerOutcome groups?
RoomNights_anova <- aov(formula = RoomNights ~ CustomerOutcome, data = HotelLisbon_data)
print(summary(RoomNights_anova))
##                    Df Sum Sq Mean Sq F value Pr(>F)    
## CustomerOutcome     1 123150  123150   34787 <2e-16 ***
## Residuals       74998 265498       4                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Chi-Square Tests for Categorical Variables:

# Chi-square test for DistributionChannel
# Pearson chi-squared test on the DistributionChannel x CustomerOutcome
# contingency table. The "data:" line of the printed result is the argument
# expression as written here, so the call is kept in this exact form.
distribution_chi2 <- chisq.test(table(HotelLisbon_data$DistributionChannel, HotelLisbon_data$CustomerOutcome))
print(distribution_chi2)
## 
##  Pearson's Chi-squared test
## 
## data:  table(HotelLisbon_data$DistributionChannel, HotelLisbon_data$CustomerOutcome)
## X-squared = 376.69, df = 3, p-value < 2.2e-16
# Chi-square test for MarketSegment
# Pearson chi-squared test on the MarketSegment x CustomerOutcome contingency
# table. The "data:" line of the printed result is the argument expression as
# written here, so the call is kept in this exact form.
segment_chi2 <- chisq.test(table(HotelLisbon_data$MarketSegment, HotelLisbon_data$CustomerOutcome))
print(segment_chi2)
## 
##  Pearson's Chi-squared test
## 
## data:  table(HotelLisbon_data$MarketSegment, HotelLisbon_data$CustomerOutcome)
## X-squared = 642, df = 6, p-value < 2.2e-16
# Chi-square test for Continent
# Pearson chi-squared test on the Continent x CustomerOutcome contingency table.
# NOTE(review): the "approximation may be incorrect" warning below is most
# likely caused by the very sparse AN row (4 and 6 observations in the pivot
# table); consider merging sparse continents or passing
# simulate.p.value = TRUE -- confirm which is appropriate for the analysis.
continent_chi2 <- chisq.test(table(HotelLisbon_data$Continent, HotelLisbon_data$CustomerOutcome))
## Warning in chisq.test(table(HotelLisbon_data$Continent,
## HotelLisbon_data$CustomerOutcome)): Chi-squared approximation may be incorrect
print(continent_chi2)
## 
##  Pearson's Chi-squared test
## 
## data:  table(HotelLisbon_data$Continent, HotelLisbon_data$CustomerOutcome)
## X-squared = 181.45, df = 6, p-value < 2.2e-16
# List the column names currently present in the data set.
print(colnames(HotelLisbon_data))
##  [1] "ID"                   "Age"                  "LodgingRevenue"      
##  [4] "OtherRevenue"         "BookingsCanceled"     "BookingsNoShowed"    
##  [7] "BookingsCheckedIn"    "PersonsNights"        "RoomNights"          
## [10] "DistributionChannel"  "MarketSegment"        "SRHighFloor"         
## [13] "SRLowFloor"           "SRAccessibleRoom"     "SRMediumFloor"       
## [16] "SRBathtub"            "SRShower"             "SRCrib"              
## [19] "SRKingSizeBed"        "SRTwinBed"            "SRNearElevator"      
## [22] "SRAwayFromElevator"   "SRNoAlcoholInMiniBar" "SRQuietRoom"         
## [25] "CheckedInCategory"    "CanceledCategory"     "NoShowCategory"      
## [28] "CustomerOutcome"      "Continent"

#Let’s confirm that the Boolean variables are in fact binary variables:

# Special-request (SR*) flag columns that are expected to hold exactly two values.
binary_vars <- c(
  "SRHighFloor", "SRLowFloor", "SRAccessibleRoom", "SRMediumFloor",
  "SRBathtub", "SRShower", "SRCrib", "SRKingSizeBed", "SRTwinBed",
  "SRNearElevator", "SRAwayFromElevator", "SRNoAlcoholInMiniBar", "SRQuietRoom"
)

# For each flag: print its name, its value counts, and whether exactly two
# distinct values occur in the column.
for (flag_name in binary_vars) {
  print(paste("Variable:", flag_name))
  print(table(HotelLisbon_data[[flag_name]]))
  print(
    if (length(unique(HotelLisbon_data[[flag_name]])) == 2) {
      "This is a binary variable."
    } else {
      "This is not a binary variable."
    }
  )
  cat("\n")
}
## [1] "Variable: SRHighFloor"
## 
##     0     1 
## 71435  3565 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRLowFloor"
## 
##     0     1 
## 74897   103 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRAccessibleRoom"
## 
##     0     1 
## 74979    21 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRMediumFloor"
## 
##     0     1 
## 74933    67 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRBathtub"
## 
##     0     1 
## 74784   216 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRShower"
## 
##     0     1 
## 74866   134 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRCrib"
## 
##     0     1 
## 74006   994 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRKingSizeBed"
## 
##     0     1 
## 48583 26417 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRTwinBed"
## 
##     0     1 
## 64277 10723 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRNearElevator"
## 
##     0     1 
## 74973    27 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRAwayFromElevator"
## 
##     0     1 
## 74729   271 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRNoAlcoholInMiniBar"
## 
##     0     1 
## 74990    10 
## [1] "This is a binary variable."
## 
## [1] "Variable: SRQuietRoom"
## 
##     0     1 
## 68398  6602 
## [1] "This is a binary variable."

Visualisations:

# Load ggplot2 library
library(ggplot2)

# Two-colour palette: one fill colour per CustomerOutcome level.
colors <- c("#1f78b4", "#33a02c")

# Box plots of every numeric variable, split by CustomerOutcome.
numerical_vars <- c("Age", "LodgingRevenue", "OtherRevenue", "PersonsNights", "RoomNights")
for (var in numerical_vars) {
  # `.data[[var]]` looks the column up by name inside the plotted data frame.
  # Unlike get(var), it cannot silently pick up a same-named object from the
  # workspace when the column is missing.
  p <- ggplot(
    HotelLisbon_data,
    aes(x = CustomerOutcome, y = .data[[var]], fill = factor(CustomerOutcome))
  ) +
    geom_boxplot() +
    labs(
      title = paste("Box plot for", var, "vs CustomerOutcome"),
      x = "CustomerOutcome",
      y = var
    ) +
    scale_fill_manual(values = colors) +
    theme_minimal() +
    theme(legend.position = "none")

  # ggplot objects are not auto-printed inside a loop.
  print(p)
}

# Dodged bar plots: counts of each categorical level, split by CustomerOutcome.
categorical_vars <- c(
  "BookingsCanceled", "BookingsNoShowed", "BookingsCheckedIn",
  "DistributionChannel", "MarketSegment", "Continent"
)
for (var in categorical_vars) {
  # `.data[[var]]` resolves the column by name inside the plotted data frame
  # only, instead of get(var), which can also find workspace objects.
  p <- ggplot(
    HotelLisbon_data,
    aes(x = factor(.data[[var]]), fill = factor(CustomerOutcome))
  ) +
    geom_bar(position = "dodge") +
    labs(
      title = paste("Bar plot for", var, "vs CustomerOutcome"),
      x = var,
      y = "Count"
    ) +
    scale_fill_manual(values = colors) +
    theme_minimal()

  # ggplot objects are not auto-printed inside a loop.
  print(p)
}

# Box plots of Age for each level of a categorical variable, with one box per
# CustomerOutcome level inside every category.
mixed_vars <- c("CheckedInCategory", "CanceledCategory", "NoShowCategory", "MarketSegment")
for (var in mixed_vars) {
  # `.data[[var]]` resolves the column by name inside the plotted data frame
  # only, instead of get(var), which can also find workspace objects.
  p <- ggplot(
    HotelLisbon_data,
    aes(x = factor(.data[[var]]), y = Age, fill = factor(CustomerOutcome))
  ) +
    geom_boxplot() +
    labs(
      title = paste("Box plot for Age vs", var, "vs CustomerOutcome"),
      x = var,
      y = "Age"
    ) +
    scale_fill_manual(values = colors) +
    theme_minimal() +
    theme(legend.position = "none")

  # ggplot objects are not auto-printed inside a loop.
  print(p)
}

# Load ggplot2 library
library(ggplot2)

# Two-colour palette: one fill colour per CustomerOutcome level.
colors <- c("#1f78b4", "#33a02c")

# Special-request (SR*) flag columns to plot.
binary_vars <- c(
  "SRHighFloor", "SRLowFloor", "SRAccessibleRoom", "SRMediumFloor",
  "SRBathtub", "SRShower", "SRCrib", "SRKingSizeBed", "SRTwinBed",
  "SRNearElevator", "SRAwayFromElevator", "SRNoAlcoholInMiniBar", "SRQuietRoom"
)

# Dodged bar plots: counts of each flag value, split by CustomerOutcome.
for (var in binary_vars) {
  # `.data[[var]]` resolves the column by name inside the plotted data frame
  # only, instead of get(var), which can also find workspace objects.
  p <- ggplot(
    HotelLisbon_data,
    aes(x = factor(.data[[var]]), fill = factor(CustomerOutcome))
  ) +
    geom_bar(position = "dodge") +
    labs(
      title = paste("Bar plot for", var, "vs CustomerOutcome"),
      x = var,
      y = "Count"
    ) +
    scale_fill_manual(values = colors) +
    theme_minimal()

  # ggplot objects are not auto-printed inside a loop.
  print(p)
}

# Load ggplot2 library
library(ggplot2)

# Two-colour palette: one fill colour per CustomerOutcome level.
colors <- c("#1f78b4", "#33a02c")

# Special-request (SR*) flag columns to plot.
binary_vars <- c(
  "SRHighFloor", "SRLowFloor", "SRAccessibleRoom", "SRMediumFloor",
  "SRBathtub", "SRShower", "SRCrib", "SRKingSizeBed", "SRTwinBed",
  "SRNearElevator", "SRAwayFromElevator", "SRNoAlcoholInMiniBar", "SRQuietRoom"
)

# Variables used to split each plot into facet columns.
other_vars <- c("MarketSegment", "CheckedInCategory", "PersonsNights")

# One plot per (facet variable, flag) pair: counts of each flag value split by
# CustomerOutcome, with one facet column per value of the facet variable.
# NOTE(review): PersonsNights is numeric, so it may produce a large number of
# narrow facets -- confirm this is intended.
for (other_var in other_vars) {
  for (var in binary_vars) {
    # `.data[[var]]` resolves the column by name inside the plotted data frame
    # only, instead of get(var), which can also find workspace objects.
    p <- ggplot(
      HotelLisbon_data,
      aes(x = factor(.data[[var]]), fill = factor(CustomerOutcome))
    ) +
      geom_bar(position = "dodge") +
      facet_grid(paste(". ~", other_var)) +
      labs(
        title = paste("Bar plot for", var, "vs CustomerOutcome by", other_var),
        x = var,
        y = "Count"
      ) +
      scale_fill_manual(values = colors) +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))

    # ggplot objects are not auto-printed inside a loop.
    print(p)
  }
}

# Load ggplot2 library
library(ggplot2)

# Two-colour palette: one fill colour per CustomerOutcome level.
colors <- c("#1f78b4", "#33a02c")

# Categorical variables to plot.
categorical_vars <- c("DistributionChannel", "MarketSegment", "Continent", "CheckedInCategory")

# Dodged bar plots: counts of each categorical level, split by CustomerOutcome.
for (var in categorical_vars) {
  # `.data[[var]]` resolves the column by name inside the plotted data frame
  # only, instead of get(var), which can also find workspace objects.
  p <- ggplot(
    HotelLisbon_data,
    aes(x = factor(.data[[var]]), fill = factor(CustomerOutcome))
  ) +
    geom_bar(position = "dodge") +
    labs(
      title = paste("Bar plot for", var, "vs CustomerOutcome"),
      x = var,
      y = "Count"
    ) +
    scale_fill_manual(values = colors) +
    theme_minimal()

  # ggplot objects are not auto-printed inside a loop.
  print(p)
}

# Numeric variables
numeric_vars <- c("Age", "LodgingRevenue", "OtherRevenue", "PersonsNights", "RoomNights")

# Box plots of every numeric variable, split by CustomerOutcome (legend kept).
for (var in numeric_vars) {
  # `.data[[var]]` resolves the column by name inside the plotted data frame
  # only, instead of get(var), which can also find workspace objects.
  p <- ggplot(
    HotelLisbon_data,
    aes(x = factor(CustomerOutcome), y = .data[[var]], fill = factor(CustomerOutcome))
  ) +
    geom_boxplot() +
    labs(
      title = paste("Box plot for", var, "vs CustomerOutcome"),
      x = "CustomerOutcome",
      y = var
    ) +
    scale_fill_manual(values = colors) +
    theme_minimal()

  # ggplot objects are not auto-printed inside a loop.
  print(p)
}

#Bar Plot for Market Segments: Visualizing the number of customers in each market segment:

library(ggplot2)

# Number of rows (customers) in each market segment.
print(
  ggplot(data = HotelLisbon_data, mapping = aes(x = MarketSegment)) +
    geom_bar(fill = "blue", color = "black") +
    labs(
      title = "Number of Customers in Each Market Segment",
      x = "Market Segment",
      y = "Count"
    ) +
    theme_minimal()
)

#Box plots, bar plots and a histogram for bookings checked in

# NOTE(review): the "No"/"Yes" legend labels used below assume that
# BookingsCheckedIn has exactly two levels (0 and 1) -- confirm against the data.

# Box plot of lodging revenue for each BookingsCheckedIn level.
print(
  ggplot(
    data = HotelLisbon_data,
    mapping = aes(x = as.factor(BookingsCheckedIn), y = LodgingRevenue)
  ) +
    geom_boxplot(outlier.color = "red", fill = "lightgreen") +
    scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
    labs(title = "Lodging Revenue by Bookings Checked In", x = "Bookings Checked In", y = "Lodging Revenue")
)

# Dodged bar plot: no-show counts split by BookingsCheckedIn.
print(
  ggplot(
    data = HotelLisbon_data,
    mapping = aes(x = as.factor(BookingsNoShowed), fill = as.factor(BookingsCheckedIn))
  ) +
    geom_bar(position = "dodge") +
    scale_fill_discrete(name = "Checked In", labels = c("No", "Yes")) +
    labs(title = "Bookings No Shows vs. Bookings Checked In", x = "Bookings No Shows", y = "Count", fill = "Checked In")
)

# Dodged bar plot: cancellation counts split by BookingsCheckedIn.
print(
  ggplot(
    data = HotelLisbon_data,
    mapping = aes(x = as.factor(BookingsCanceled), fill = as.factor(BookingsCheckedIn))
  ) +
    geom_bar(position = "dodge") +
    scale_fill_discrete(name = "Checked In", labels = c("No", "Yes")) +
    labs(title = "Bookings Cancelled vs. Bookings Checked In", x = "Bookings Cancelled", y = "Count", fill = "Checked In")
)

# Stacked bar plot: market segment counts split by BookingsCheckedIn.
print(
  ggplot(
    data = HotelLisbon_data,
    mapping = aes(x = MarketSegment, fill = as.factor(BookingsCheckedIn))
  ) +
    geom_bar(position = "stack") +
    scale_fill_discrete(name = "Checked In", labels = c("No", "Yes")) +
    labs(title = "Market Segment and Bookings Checked In", x = "Market Segment", y = "Count", fill = "Checked In")
)

# Overlaid, semi-transparent histograms of lodging revenue per BookingsCheckedIn level.
print(
  ggplot(
    data = HotelLisbon_data,
    mapping = aes(x = LodgingRevenue, fill = as.factor(BookingsCheckedIn))
  ) +
    geom_histogram(position = "identity", alpha = 0.5, bins = 30) +
    scale_fill_discrete(name = "Checked In", labels = c("No", "Yes")) +
    labs(title = "Lodging Revenue by Bookings Checked In", x = "Lodging Revenue", y = "Frequency")
)

# Lodging revenue of every row plotted against its market segment, coloured by
# BookingsCheckedIn (points are semi-transparent because many overlap).
print(
  ggplot(
    data = HotelLisbon_data,
    mapping = aes(x = MarketSegment, y = LodgingRevenue, color = as.factor(BookingsCheckedIn))
  ) +
    geom_point(alpha = 0.6) +
    scale_color_discrete(name = "Checked In", labels = c("No", "Yes")) +
    labs(title = "Scatter Plot of Lodging Revenue vs. Market Segment", x = "Market Segment", y = "Lodging Revenue")
)

#There are skewed graphs and outliers that we need to resolve

# Box plots to identify outliers in the numeric revenue columns.
boxplot(HotelLisbon_data$LodgingRevenue, main = "Boxplot for Lodging Revenue")

boxplot(HotelLisbon_data$OtherRevenue, main = "Boxplot for Other Revenue")

# MarketSegment and DistributionChannel are factors, so a box plot (and the
# notion of an outlier) is not meaningful for them. Show how often each level
# occurs instead; las = 2 turns the long level names so they stay readable.
barplot(table(HotelLisbon_data$MarketSegment), main = "Frequency of Market Segments", las = 2)

barplot(table(HotelLisbon_data$DistributionChannel), main = "Frequency of Distribution Channels", las = 2)

#There are potential outliers in Other Revenue, Bookings canceled and bookings no shows. We would need to further discuss this with the professor. Do we use these variables or not. We are confused here.

Now we need to Balance our data #Check Frequency of the Dependent Variable

# Row counts for each CustomerOutcome class.
print(table(HotelLisbon_data[["CustomerOutcome"]]))
## 
##     0     1 
## 18020 56980

#0 represents customers who canceled or were No shows at the Hotel #1 represents customers who stayed at the hotel

#Visualize the class distribution using a bar plot or pie chart

# Assuming your dependent variable is named "CustomerOutcome"
library(ggplot2)

# Bar chart of the number of rows in each CustomerOutcome class.
print(
  ggplot(data = HotelLisbon_data, mapping = aes(x = CustomerOutcome)) +
    labs(title = "Class Distribution of CustomerOutcome") +
    geom_bar()
)

#0 is definitely our minority class, so we must undersample the majority class (1) here

#Let’s finally check Class Proportions

# Share of rows in each CustomerOutcome class.
print(prop.table(table(HotelLisbon_data[["CustomerOutcome"]])))
## 
##         0         1 
## 0.2402667 0.7597333
# List the column names available before balancing.
print(colnames(HotelLisbon_data))
##  [1] "ID"                   "Age"                  "LodgingRevenue"      
##  [4] "OtherRevenue"         "BookingsCanceled"     "BookingsNoShowed"    
##  [7] "BookingsCheckedIn"    "PersonsNights"        "RoomNights"          
## [10] "DistributionChannel"  "MarketSegment"        "SRHighFloor"         
## [13] "SRLowFloor"           "SRAccessibleRoom"     "SRMediumFloor"       
## [16] "SRBathtub"            "SRShower"             "SRCrib"              
## [19] "SRKingSizeBed"        "SRTwinBed"            "SRNearElevator"      
## [22] "SRAwayFromElevator"   "SRNoAlcoholInMiniBar" "SRQuietRoom"         
## [25] "CheckedInCategory"    "CanceledCategory"     "NoShowCategory"      
## [28] "CustomerOutcome"      "Continent"

#Balancing the Data Set

# Install and load necessary packages
# Install ROSE only when it is missing: reinstalling on every run is slow,
# needs network access, and can fail while the package is in use.
if (!requireNamespace("ROSE", quietly = TRUE)) {
  install.packages("ROSE")
}
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'ROSE' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
# Install caret only when it is missing: reinstalling on every run is slow,
# needs network access, and can fail while the package is in use.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'caret' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
# Install pROC only when it is missing. Reinstalling an already-installed copy
# triggered "cannot remove prior installation" / "Permission denied" warnings.
if (!requireNamespace("pROC", quietly = TRUE)) {
  install.packages("pROC")
}
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'pROC' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'pROC'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Latitude\AppData\Local\R\win-library\4.3\00LOCK\pROC\libs\x64\pROC.dll
## to C:\Users\Latitude\AppData\Local\R\win-library\4.3\pROC\libs\x64\pROC.dll:
## Permission denied
## Warning: restored 'pROC'
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.3.2
## Loaded ROSE 0.0-4
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: lattice
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.2
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Undersampling using ROSE
# method = "under" randomly drops rows of the larger class until the result has
# N rows; N is twice the number of CustomerOutcome == 0 rows, i.e. both classes
# end up the same size. ID is deliberately left out of the formula.
# seed makes the random draw reproducible: without it every run keeps a
# different subset of rows, so all downstream results change between runs.
UnderSampled_HotelLisbon <- ovun.sample(
  CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue +
    BookingsCanceled + BookingsNoShowed + BookingsCheckedIn + PersonsNights +
    RoomNights + DistributionChannel + MarketSegment + SRHighFloor +
    SRLowFloor + SRAccessibleRoom + SRMediumFloor + SRBathtub + SRShower +
    SRCrib + SRKingSizeBed + SRTwinBed + SRNearElevator + SRAwayFromElevator +
    SRNoAlcoholInMiniBar + SRQuietRoom + CheckedInCategory + CanceledCategory +
    NoShowCategory,
  data = HotelLisbon_data,
  method = "under",
  N = 2 * sum(HotelLisbon_data$CustomerOutcome == 0),
  seed = 123
)$data

# Preview the first six rows of the undersampled data.
print(head(UnderSampled_HotelLisbon, n = 6L))
##   Age LodgingRevenue OtherRevenue BookingsCanceled BookingsNoShowed
## 1  39         692.76       112.40                0                0
## 2  49         257.40       197.50                0                0
## 3  66         349.28        50.76                0                0
## 4  39         471.00        42.00                0                0
## 5  33        1561.50       333.00                0                0
## 6  42         544.00        56.00                0                0
##   BookingsCheckedIn PersonsNights RoomNights   DistributionChannel
## 1                 1            10          5 Travel Agent/Operator
## 2                 1             6          2 Travel Agent/Operator
## 3                 1             8          4 Travel Agent/Operator
## 4                 1             6          3 Travel Agent/Operator
## 5                 1            10          5                Direct
## 6                 1             8          4 Travel Agent/Operator
##           MarketSegment SRHighFloor SRLowFloor SRAccessibleRoom SRMediumFloor
## 1                 Other           0          0                0             0
## 2                 Other           0          0                0             0
## 3 Travel Agent/Operator           0          0                0             0
## 4                 Other           1          0                0             0
## 5                Direct           0          0                0             0
## 6                 Other           0          0                0             0
##   SRBathtub SRShower SRCrib SRKingSizeBed SRTwinBed SRNearElevator
## 1         0        0      0             0         0              0
## 2         0        0      0             1         0              0
## 3         0        0      0             0         0              0
## 4         0        0      0             1         0              0
## 5         0        0      0             0         0              0
## 6         0        0      0             1         0              0
##   SRAwayFromElevator SRNoAlcoholInMiniBar SRQuietRoom CheckedInCategory
## 1                  0                    0           0       Stayed Once
## 2                  0                    0           0       Stayed Once
## 3                  0                    0           0       Stayed Once
## 4                  0                    0           0       Stayed Once
## 5                  0                    0           0       Stayed Once
## 6                  0                    0           0       Stayed Once
##        CanceledCategory                 NoShowCategory CustomerOutcome
## 1 Checked In and Stayed Customers Showed up and Stayed               1
## 2 Checked In and Stayed Customers Showed up and Stayed               1
## 3 Checked In and Stayed Customers Showed up and Stayed               1
## 4 Checked In and Stayed Customers Showed up and Stayed               1
## 5 Checked In and Stayed Customers Showed up and Stayed               1
## 6 Checked In and Stayed Customers Showed up and Stayed               1
##   Continent
## 1        EU
## 2        EU
## 3        EU
## 4        EU
## 5        EU
## 6        EU
# Check that the undersampled data is balanced: row counts per class.
print(table(UnderSampled_HotelLisbon[["CustomerOutcome"]]))
## 
##     1     0 
## 18020 18020
# Share of rows per class in the undersampled data.
print(prop.table(table(UnderSampled_HotelLisbon[["CustomerOutcome"]])))
## 
##   1   0 
## 0.5 0.5

Let’s hope it goes all well from here

#OK, so now we have to do the training and testing with our dataset and the dependent variable: CustomerOutcome.

# Fix the RNG state so the partition below is reproducible.
set.seed(123)

# Row indices for the training set: 60% of the undersampled rows, drawn by
# createDataPartition() on the outcome column; list = FALSE returns a matrix.
trainIndex <- createDataPartition(
  y = UnderSampled_HotelLisbon$CustomerOutcome,
  p = 0.6,
  list = FALSE
)

# Selected rows form the training set; all remaining rows form the test set.
# NOTE(review): the split is taken from the already-undersampled data, so the
# test set is balanced too and may not reflect the original class mix -- confirm
# this is intended.
TestData <- UnderSampled_HotelLisbon[-trainIndex, ]
TrainData <- UnderSampled_HotelLisbon[trainIndex, ]
# Pairwise correlations between the numeric columns of the training split;
# vapply() guarantees a logical column mask.
cor_matrix <- cor(x = TrainData[, vapply(TrainData, is.numeric, logical(1))])
print("Correlation Matrix for Training Data:")
## [1] "Correlation Matrix for Training Data:"
print(cor_matrix)
##                       Age LodgingRevenue OtherRevenue PersonsNights RoomNights
## Age            1.00000000      0.0233162    0.0890071     0.0607649 0.05711301
## LodgingRevenue 0.02331620      1.0000000    0.6296349     0.6930740 0.77442878
## OtherRevenue   0.08900710      0.6296349    1.0000000     0.6125930 0.53863861
## PersonsNights  0.06076490      0.6930740    0.6125930     1.0000000 0.83178456
## RoomNights     0.05711301      0.7744288    0.5386386     0.8317846 1.00000000
# Install ggpubr only when it is missing: reinstalling on every run is slow,
# needs network access, and can fail while the package is in use.
if (!requireNamespace("ggpubr", quietly = TRUE)) {
  install.packages("ggpubr")
}
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'ggpubr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.3.2
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
## 
##     mutate
library(tidyverse)
## Warning: package 'purrr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ tibble  3.2.1
## ✔ purrr   1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()      masks plyr::arrange()
## ✖ data.table::between() masks dplyr::between()
## ✖ purrr::compact()      masks plyr::compact()
## ✖ dplyr::count()        masks plyr::count()
## ✖ dplyr::desc()         masks plyr::desc()
## ✖ dplyr::failwith()     masks plyr::failwith()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ data.table::first()   masks dplyr::first()
## ✖ data.table::hour()    masks lubridate::hour()
## ✖ dplyr::id()           masks plyr::id()
## ✖ data.table::isoweek() masks lubridate::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ data.table::last()    masks dplyr::last()
## ✖ purrr::lift()         masks caret::lift()
## ✖ data.table::mday()    masks lubridate::mday()
## ✖ data.table::minute()  masks lubridate::minute()
## ✖ data.table::month()   masks lubridate::month()
## ✖ ggpubr::mutate()      masks dplyr::mutate(), plyr::mutate()
## ✖ data.table::quarter() masks lubridate::quarter()
## ✖ dplyr::rename()       masks plyr::rename()
## ✖ data.table::second()  masks lubridate::second()
## ✖ dplyr::summarise()    masks plyr::summarise()
## ✖ dplyr::summarize()    masks plyr::summarize()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ data.table::wday()    masks lubridate::wday()
## ✖ data.table::week()    masks lubridate::week()
## ✖ data.table::yday()    masks lubridate::yday()
## ✖ data.table::year()    masks lubridate::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(devtools)
## Warning: package 'devtools' was built under R version 4.3.1
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.3.1
# Install psych only when it is missing: reinstalling on every run is slow,
# needs network access, and can fail while the package is in use.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'psych' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
library(psych)
## Warning: package 'psych' was built under R version 4.3.2
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Install PerformanceAnalytics only when it is missing: reinstalling on every
# run is slow, needs network access, and can fail while the package is in use.
if (!requireNamespace("PerformanceAnalytics", quietly = TRUE)) {
  install.packages("PerformanceAnalytics")
}
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'PerformanceAnalytics' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.3.2
## Loading required package: xts
## Warning: package 'xts' was built under R version 4.3.1
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:data.table':
## 
##     first, last
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
library("ggplot2")

# Install rcompanion only when it is missing: reinstalling on every run is
# slow, needs network access, and can fail while the package is in use.
if (!requireNamespace("rcompanion", quietly = TRUE)) {
  install.packages("rcompanion")
}
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'rcompanion' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
library(rcompanion)
## Warning: package 'rcompanion' was built under R version 4.3.2
## 
## Attaching package: 'rcompanion'
## 
## The following object is masked from 'package:psych':
## 
##     phi
# Scatter-plot matrix of the five numeric variables on their original scale.
pairs(
  ~ Age + LodgingRevenue + OtherRevenue + PersonsNights + RoomNights,
  data = HotelLisbon_data
)

# Apply log(1 + x) to the numeric variables Age, LodgingRevenue, OtherRevenue,
# PersonsNights and RoomNights.

log_transformed_data <- log1p(
  HotelLisbon_data[, c("Age", "LodgingRevenue", "OtherRevenue", "PersonsNights", "RoomNights")]
)

# Scatter-plot matrix of the same variables after the log transform.
pairs(
  ~ Age + LodgingRevenue + OtherRevenue + PersonsNights + RoomNights,
  data = log_transformed_data
)

# Normal Q-Q plots. Each variable is drawn twice: once with base graphics
# (qqnorm + reference line) and once with ggpubr's ggqqplot().

# Age, full data set
qqnorm(HotelLisbon_data[["Age"]])
qqline(HotelLisbon_data[["Age"]])

print(ggqqplot(HotelLisbon_data[["Age"]], ylab = "Age"))

# LodgingRevenue, full data set
qqnorm(HotelLisbon_data[["LodgingRevenue"]])
qqline(HotelLisbon_data[["LodgingRevenue"]])

print(ggqqplot(HotelLisbon_data[["LodgingRevenue"]], ylab = "Lodging revenue"))

# LodgingRevenue, training split only
qqnorm(TrainData[["LodgingRevenue"]])
qqline(TrainData[["LodgingRevenue"]])

print(ggqqplot(TrainData[["LodgingRevenue"]], ylab = "Lodging revenue"))

# PersonsNights, full data set
qqnorm(HotelLisbon_data[["PersonsNights"]])
qqline(HotelLisbon_data[["PersonsNights"]])

print(ggqqplot(HotelLisbon_data[["PersonsNights"]], ylab = "Persons Per Night"))

# RoomNights, full data set
qqnorm(HotelLisbon_data[["RoomNights"]])
qqline(HotelLisbon_data[["RoomNights"]])

print(ggqqplot(HotelLisbon_data[["RoomNights"]], ylab = "Rooms per night"))

# Pearson correlation test between Age and LodgingRevenue.
# The "data:" line of the printed result is built from the two argument
# expressions as written here. cor_test_result is reused (overwritten) by the
# following tests.
cor_test_result <- cor.test(HotelLisbon_data$Age, HotelLisbon_data$LodgingRevenue, method = "pearson")
print(cor_test_result)
## 
##  Pearson's product-moment correlation
## 
## data:  HotelLisbon_data$Age and HotelLisbon_data$LodgingRevenue
## t = -0.34456, df = 74998, p-value = 0.7304
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.008414898  0.005898674
## sample estimates:
##          cor 
## -0.001258176
# Pearson correlation test between RoomNights and LodgingRevenue.
# Overwrites cor_test_result from the previous test; the "data:" line of the
# printed result is built from the argument expressions as written here.
cor_test_result <- cor.test(HotelLisbon_data$RoomNights, HotelLisbon_data$LodgingRevenue, method = "pearson")
print(cor_test_result)
## 
##  Pearson's product-moment correlation
## 
## data:  HotelLisbon_data$RoomNights and HotelLisbon_data$LodgingRevenue
## t = 261.67, df = 74998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6870703 0.6945530
## sample estimates:
##       cor 
## 0.6908302
# Pearson correlation test between Age and OtherRevenue.
# Overwrites cor_test_result from the previous test; the "data:" line of the
# printed result is built from the argument expressions as written here.
cor_test_result <- cor.test(HotelLisbon_data$Age, HotelLisbon_data$OtherRevenue, method = "pearson")
print(cor_test_result)
## 
##  Pearson's product-moment correlation
## 
## data:  HotelLisbon_data$Age and HotelLisbon_data$OtherRevenue
## t = 24.451, df = 74998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.08182382 0.09602423
## sample estimates:
##        cor 
## 0.08892854
# Install corrplot only when it is missing: reinstalling on every run is slow,
# needs network access, and can fail while the package is in use.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
## Installing package into 'C:/Users/Latitude/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'corrplot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Latitude\AppData\Local\Temp\RtmpyaDZku\downloaded_packages
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
# Recompute the correlations between the numeric columns of the full data set
# and draw them as a colour-coded grid.
correlation_matrix <- cor(
  x = HotelLisbon_data[, vapply(HotelLisbon_data, is.numeric, logical(1))]
)
corrplot(corr = correlation_matrix, method = "color")

# TODO: redo this part for better models. Let's fit logistic regressions with CustomerOutcome as the dependent variable and the columns below as regressors.

# Logistic regression model 1: Continent, Age, revenue, nights, distribution
# channel, market segment, every special-request (SR*) column and
# CheckedInCategory.
# NOTE(review): the recorded warning and the very large standard errors in the
# summary that follows suggest (quasi-)complete separation; interpret the
# coefficients with caution.
logistic_model1 <- glm(
  formula = CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue +
    PersonsNights + RoomNights + DistributionChannel + MarketSegment +
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor + SRBathtub +
    SRShower + SRCrib + SRKingSizeBed + SRTwinBed + SRNearElevator +
    SRAwayFromElevator + SRNoAlcoholInMiniBar + SRQuietRoom +
    CheckedInCategory,
  family = "binomial",
  data = TrainData
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistic_model1)
## 
## Call:
## glm(formula = CustomerOutcome ~ Continent + Age + LodgingRevenue + 
##     OtherRevenue + PersonsNights + RoomNights + DistributionChannel + 
##     MarketSegment + SRHighFloor + SRLowFloor + SRAccessibleRoom + 
##     SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed + 
##     SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + 
##     SRQuietRoom + CheckedInCategory, family = "binomial", data = TrainData)
## 
## Coefficients:
##                                              Estimate Std. Error z value
## (Intercept)                                 5.852e+01  2.687e+03   0.022
## ContinentAN                                -1.713e+01  1.317e+04  -0.001
## ContinentAS                                -1.610e+00  1.318e+00  -1.222
## ContinentEU                                -1.843e+00  8.577e-01  -2.149
## ContinentNA                                -2.000e+00  1.122e+00  -1.782
## ContinentOC                                -1.849e+01  3.530e+03  -0.005
## ContinentSA                                -1.801e+01  1.935e+03  -0.009
## Age                                         3.157e-02  1.334e-02   2.366
## LodgingRevenue                              2.634e-04  3.110e-04   0.847
## OtherRevenue                               -1.073e-03  1.715e-03  -0.626
## PersonsNights                              -1.144e-01  7.612e-02  -1.503
## RoomNights                                  1.526e-01  8.589e-02   1.777
## DistributionChannelDirect                   9.882e-01  8.264e-01   1.196
## DistributionChannelElectronic Distribution -1.827e+01  3.250e+03  -0.006
## DistributionChannelTravel Agent/Operator   -8.370e-01  8.562e-01  -0.978
## MarketSegmentComplementary                 -2.331e+00  1.055e+00  -2.209
## MarketSegmentCorporate                     -1.555e+00  5.147e-01  -3.021
## MarketSegmentDirect                        -4.046e+00  1.000e+00  -4.045
## MarketSegmentGroups                        -4.198e+00  1.132e+00  -3.707
## MarketSegmentOther                         -6.680e+00  1.416e+00  -4.718
## MarketSegmentTravel Agent/Operator         -2.089e+01  1.753e+03  -0.012
## SRHighFloor1                                2.981e-01  9.068e-01   0.329
## SRLowFloor1                                -2.426e+01  1.235e+05   0.000
## SRAccessibleRoom1                          -7.792e+00  3.349e+03  -0.002
## SRMediumFloor1                             -5.587e-02  1.128e+00  -0.050
## SRBathtub1                                 -1.522e+01  4.895e+03  -0.003
## SRShower1                                  -1.707e+01  1.228e+04  -0.001
## SRCrib1                                    -1.532e+01  1.933e+03  -0.008
## SRKingSizeBed1                              6.717e-01  3.576e-01   1.878
## SRTwinBed1                                  5.255e-01  5.814e-01   0.904
## SRNearElevator1                            -1.167e+01  3.351e+03  -0.003
## SRAwayFromElevator1                         3.518e-01  1.571e+00   0.224
## SRNoAlcoholInMiniBar1                      -1.160e+01  9.623e+04   0.000
## SRQuietRoom1                                1.400e+00  7.109e-01   1.969
## CheckedInCategoryGood Loyal Customers      -5.649e+01  2.687e+03  -0.021
## CheckedInCategoryGreat Loyal Customers     -3.137e+01  7.475e+04   0.000
## CheckedInCategoryStayed Once               -6.030e+01  2.687e+03  -0.022
## CheckedInCategoryStayed Thrice             -5.785e+01  2.687e+03  -0.022
## CheckedInCategoryStayed Twice              -5.843e+01  2.687e+03  -0.022
##                                            Pr(>|z|)    
## (Intercept)                                 0.98263    
## ContinentAN                                 0.99896    
## ContinentAS                                 0.22180    
## ContinentEU                                 0.03163 *  
## ContinentNA                                 0.07477 .  
## ContinentOC                                 0.99582    
## ContinentSA                                 0.99257    
## Age                                         0.01799 *  
## LodgingRevenue                              0.39693    
## OtherRevenue                                0.53142    
## PersonsNights                               0.13272    
## RoomNights                                  0.07554 .  
## DistributionChannelDirect                   0.23180    
## DistributionChannelElectronic Distribution  0.99551    
## DistributionChannelTravel Agent/Operator    0.32828    
## MarketSegmentComplementary                  0.02714 *  
## MarketSegmentCorporate                      0.00252 ** 
## MarketSegmentDirect                        5.24e-05 ***
## MarketSegmentGroups                         0.00021 ***
## MarketSegmentOther                         2.38e-06 ***
## MarketSegmentTravel Agent/Operator          0.99049    
## SRHighFloor1                                0.74238    
## SRLowFloor1                                 0.99984    
## SRAccessibleRoom1                           0.99814    
## SRMediumFloor1                              0.96050    
## SRBathtub1                                  0.99752    
## SRShower1                                   0.99889    
## SRCrib1                                     0.99368    
## SRKingSizeBed1                              0.06035 .  
## SRTwinBed1                                  0.36603    
## SRNearElevator1                             0.99722    
## SRAwayFromElevator1                         0.82284    
## SRNoAlcoholInMiniBar1                       0.99990    
## SRQuietRoom1                                0.04893 *  
## CheckedInCategoryGood Loyal Customers       0.98323    
## CheckedInCategoryGreat Loyal Customers      0.99967    
## CheckedInCategoryStayed Once                0.98210    
## CheckedInCategoryStayed Thrice              0.98282    
## CheckedInCategoryStayed Twice               0.98265    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29977.23  on 21623  degrees of freedom
## Residual deviance:   377.85  on 21585  degrees of freedom
## AIC: 455.85
## 
## Number of Fisher Scoring iterations: 24
# Logistic regression model 2: model 1 without CheckedInCategory and with only
# five of the special-request columns (SRHighFloor, SRNoAlcoholInMiniBar,
# SRLowFloor, SRAccessibleRoom, SRMediumFloor).
# NOTE(review): the fit did not converge (see the recorded warnings below).
log_model2 <- glm(
  formula = CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue +
    PersonsNights + RoomNights + DistributionChannel + MarketSegment +
    SRHighFloor + SRNoAlcoholInMiniBar + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor,
  family = "binomial",
  data = TrainData
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(log_model2)
## 
## Call:
## glm(formula = CustomerOutcome ~ Continent + Age + LodgingRevenue + 
##     OtherRevenue + PersonsNights + RoomNights + DistributionChannel + 
##     MarketSegment + SRHighFloor + SRNoAlcoholInMiniBar + SRLowFloor + 
##     SRAccessibleRoom + SRMediumFloor, family = "binomial", data = TrainData)
## 
## Coefficients:
##                                              Estimate Std. Error z value
## (Intercept)                                 3.823e+00  5.909e-01   6.469
## ContinentAN                                 2.135e-01  2.335e+00   0.091
## ContinentAS                                 7.529e-02  3.309e-01   0.228
## ContinentEU                                 1.911e-01  2.928e-01   0.653
## ContinentNA                                 4.079e-01  3.198e-01   1.275
## ContinentOC                                 6.535e-01  4.695e-01   1.392
## ContinentSA                                 3.328e-01  3.475e-01   0.958
## Age                                         7.551e-04  2.460e-03   0.307
## LodgingRevenue                             -1.091e-02  6.971e-04 -15.656
## OtherRevenue                               -2.616e-03  1.804e-03  -1.450
## PersonsNights                              -2.145e+00  1.271e-01 -16.879
## RoomNights                                  1.093e+00  1.507e-01   7.249
## DistributionChannelDirect                   8.358e-01  3.772e-01   2.216
## DistributionChannelElectronic Distribution -2.411e+00  4.266e-01  -5.653
## DistributionChannelTravel Agent/Operator    3.513e-01  2.462e-01   1.427
## MarketSegmentComplementary                 -2.914e+00  6.518e-01  -4.471
## MarketSegmentCorporate                     -2.268e+00  5.180e-01  -4.379
## MarketSegmentDirect                        -1.925e+00  6.261e-01  -3.075
## MarketSegmentGroups                        -2.118e+00  5.535e-01  -3.826
## MarketSegmentOther                         -1.336e+00  5.569e-01  -2.399
## MarketSegmentTravel Agent/Operator         -1.545e+00  5.606e-01  -2.756
## SRHighFloor1                                4.938e-01  2.010e-01   2.457
## SRNoAlcoholInMiniBar1                       2.358e+01  1.768e+05   0.000
## SRLowFloor1                                 5.928e+00  1.495e+01   0.396
## SRAccessibleRoom1                           3.188e+00  2.165e+01   0.147
## SRMediumFloor1                              4.069e+00  1.566e+00   2.599
##                                            Pr(>|z|)    
## (Intercept)                                9.88e-11 ***
## ContinentAN                                 0.92714    
## ContinentAS                                 0.82000    
## ContinentEU                                 0.51408    
## ContinentNA                                 0.20215    
## ContinentOC                                 0.16394    
## ContinentSA                                 0.33819    
## Age                                         0.75884    
## LodgingRevenue                              < 2e-16 ***
## OtherRevenue                                0.14702    
## PersonsNights                               < 2e-16 ***
## RoomNights                                 4.19e-13 ***
## DistributionChannelDirect                   0.02671 *  
## DistributionChannelElectronic Distribution 1.58e-08 ***
## DistributionChannelTravel Agent/Operator    0.15357    
## MarketSegmentComplementary                 7.79e-06 ***
## MarketSegmentCorporate                     1.19e-05 ***
## MarketSegmentDirect                         0.00211 ** 
## MarketSegmentGroups                         0.00013 ***
## MarketSegmentOther                          0.01643 *  
## MarketSegmentTravel Agent/Operator          0.00585 ** 
## SRHighFloor1                                0.01402 *  
## SRNoAlcoholInMiniBar1                       0.99989    
## SRLowFloor1                                 0.69177    
## SRAccessibleRoom1                           0.88296    
## SRMediumFloor1                              0.00936 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29977.2  on 21623  degrees of freedom
## Residual deviance:  5365.9  on 21598  degrees of freedom
## AIC: 5417.9
## 
## Number of Fisher Scoring iterations: 25
#CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue + BookingsCanceled + BookingsNoShowed + BookingsCheckedIn + PersonsNights + RoomNights + DistributionChannel + MarketSegment + SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor + 
#SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + SRQuietRoom +      CheckedInCategory + CanceledCategory + NoShowCategory

# Idea: add all the booking categories we created (CheckedInCategory, CanceledCategory, NoShowCategory). NOTE: Model 3 below does not include any of them.

# Logistic regression model 3: Continent, Age, LodgingRevenue, OtherRevenue,
# PersonsNights, RoomNights, DistributionChannel and MarketSegment only
# (no special-request columns).
# NOTE(review): the fit did not converge (see the recorded warnings below).
log_model3 <- glm(
  formula = CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue +
    PersonsNights + RoomNights + DistributionChannel + MarketSegment,
  family = "binomial",
  data = TrainData
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(log_model3)
## 
## Call:
## glm(formula = CustomerOutcome ~ Continent + Age + LodgingRevenue + 
##     OtherRevenue + PersonsNights + RoomNights + DistributionChannel + 
##     MarketSegment, family = "binomial", data = TrainData)
## 
## Coefficients:
##                                              Estimate Std. Error z value
## (Intercept)                                 3.8192390  0.5898755   6.475
## ContinentAN                                 0.1984379  2.3236437   0.085
## ContinentAS                                 0.0815647  0.3292165   0.248
## ContinentEU                                 0.1925815  0.2909054   0.662
## ContinentNA                                 0.4143379  0.3180803   1.303
## ContinentOC                                 0.6390838  0.4669915   1.369
## ContinentSA                                 0.3202585  0.3457592   0.926
## Age                                         0.0008545  0.0024578   0.348
## LodgingRevenue                             -0.0108116  0.0006940 -15.578
## OtherRevenue                               -0.0025540  0.0017983  -1.420
## PersonsNights                              -2.1304467  0.1266094 -16.827
## RoomNights                                  1.0644637  0.1503720   7.079
## DistributionChannelDirect                   0.8166977  0.3761026   2.171
## DistributionChannelElectronic Distribution -2.4448630  0.4268271  -5.728
## DistributionChannelTravel Agent/Operator    0.3454092  0.2461992   1.403
## MarketSegmentComplementary                 -2.8876589  0.6514647  -4.433
## MarketSegmentCorporate                     -2.2437190  0.5176771  -4.334
## MarketSegmentDirect                        -1.8874251  0.6252497  -3.019
## MarketSegmentGroups                        -2.1112440  0.5534893  -3.814
## MarketSegmentOther                         -1.3022145  0.5568200  -2.339
## MarketSegmentTravel Agent/Operator         -1.5352934  0.5605173  -2.739
##                                            Pr(>|z|)    
## (Intercept)                                9.50e-11 ***
## ContinentAN                                0.931944    
## ContinentAS                                0.804325    
## ContinentEU                                0.507967    
## ContinentNA                                0.192704    
## ContinentOC                                0.171152    
## ContinentSA                                0.354318    
## Age                                        0.728090    
## LodgingRevenue                              < 2e-16 ***
## OtherRevenue                               0.155541    
## PersonsNights                               < 2e-16 ***
## RoomNights                                 1.45e-12 ***
## DistributionChannelDirect                  0.029895 *  
## DistributionChannelElectronic Distribution 1.02e-08 ***
## DistributionChannelTravel Agent/Operator   0.160627    
## MarketSegmentComplementary                 9.31e-06 ***
## MarketSegmentCorporate                     1.46e-05 ***
## MarketSegmentDirect                        0.002539 ** 
## MarketSegmentGroups                        0.000136 ***
## MarketSegmentOther                         0.019353 *  
## MarketSegmentTravel Agent/Operator         0.006161 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29977.2  on 21623  degrees of freedom
## Residual deviance:  5390.1  on 21603  degrees of freedom
## AIC: 5432.1
## 
## Number of Fisher Scoring iterations: 25
#CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue + BookingsCanceled + BookingsNoShowed + BookingsCheckedIn + PersonsNights + RoomNights + DistributionChannel + MarketSegment + SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor + 
#SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + SRQuietRoom +      CheckedInCategory + CanceledCategory + NoShowCategory

# Let's try Model 4, with only the binary (special-request) variables.

# Logistic regression model 4: only the thirteen binary special-request (SR*)
# columns as predictors.
log_model4 <- glm(
  formula = CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom,
  family = "binomial",
  data = TrainData
)

summary(log_model4)
## 
## Call:
## glm(formula = CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom + 
##     SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed + 
##     SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + 
##     SRQuietRoom, family = "binomial", data = TrainData)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.10543    0.01980  -5.326 1.00e-07 ***
## SRHighFloor1           0.25085    0.06515   3.851 0.000118 ***
## SRLowFloor1            0.60581    0.37601   1.611 0.107147    
## SRAccessibleRoom1     -1.09758    1.14890  -0.955 0.339409    
## SRMediumFloor1        -0.25992    0.39160  -0.664 0.506853    
## SRBathtub1             0.07514    0.24518   0.306 0.759253    
## SRShower1              0.24147    0.34240   0.705 0.480675    
## SRCrib1                1.11269    0.12530   8.881  < 2e-16 ***
## SRKingSizeBed1         0.10753    0.03012   3.570 0.000357 ***
## SRTwinBed1             0.21566    0.04006   5.384 7.29e-08 ***
## SRNearElevator1       -0.66872    0.71470  -0.936 0.349445    
## SRAwayFromElevator1   -0.05315    0.23931  -0.222 0.824243    
## SRNoAlcoholInMiniBar1 11.27424   94.71115   0.119 0.905245    
## SRQuietRoom1           0.06041    0.04781   1.264 0.206369    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29977  on 21623  degrees of freedom
## Residual deviance: 29824  on 21610  degrees of freedom
## AIC: 29852
## 
## Number of Fisher Scoring iterations: 10

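# Let's do Model 5 with the revenue and nights variables (LodgingRevenue, OtherRevenue, PersonsNights, RoomNights) plus MarketSegment and DistributionChannel. No binary special-request variables are included.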

# Logistic regression model 5: LodgingRevenue, OtherRevenue, PersonsNights,
# RoomNights, MarketSegment and DistributionChannel.
# NOTE(review): the fit did not converge (see the recorded warnings below).
log_model5 <- glm(
  formula = CustomerOutcome ~ LodgingRevenue + OtherRevenue + PersonsNights +
    RoomNights + MarketSegment + DistributionChannel,
  family = "binomial",
  data = TrainData
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(log_model5)
## 
## Call:
## glm(formula = CustomerOutcome ~ LodgingRevenue + OtherRevenue + 
##     PersonsNights + RoomNights + MarketSegment + DistributionChannel, 
##     family = "binomial", data = TrainData)
## 
## Coefficients:
##                                              Estimate Std. Error z value
## (Intercept)                                 4.0992667  0.5021481   8.163
## LodgingRevenue                             -0.0107943  0.0006919 -15.601
## OtherRevenue                               -0.0026011  0.0018020  -1.443
## PersonsNights                              -2.1044774  0.1252628 -16.801
## RoomNights                                  1.0247596  0.1489883   6.878
## MarketSegmentComplementary                 -2.9087354  0.6523584  -4.459
## MarketSegmentCorporate                     -2.2730208  0.5194814  -4.376
## MarketSegmentDirect                        -1.8878310  0.6258659  -3.016
## MarketSegmentGroups                        -2.1003539  0.5536895  -3.793
## MarketSegmentOther                         -1.3227111  0.5568945  -2.375
## MarketSegmentTravel Agent/Operator         -1.5566609  0.5609630  -2.775
## DistributionChannelDirect                   0.7879917  0.3750888   2.101
## DistributionChannelElectronic Distribution -2.4607901  0.4251392  -5.788
## DistributionChannelTravel Agent/Operator    0.3373434  0.2435941   1.385
##                                            Pr(>|z|)    
## (Intercept)                                3.26e-16 ***
## LodgingRevenue                              < 2e-16 ***
## OtherRevenue                               0.148904    
## PersonsNights                               < 2e-16 ***
## RoomNights                                 6.06e-12 ***
## MarketSegmentComplementary                 8.24e-06 ***
## MarketSegmentCorporate                     1.21e-05 ***
## MarketSegmentDirect                        0.002558 ** 
## MarketSegmentGroups                        0.000149 ***
## MarketSegmentOther                         0.017542 *  
## MarketSegmentTravel Agent/Operator         0.005521 ** 
## DistributionChannelDirect                  0.035657 *  
## DistributionChannelElectronic Distribution 7.11e-09 ***
## DistributionChannelTravel Agent/Operator   0.166096    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29977.2  on 21623  degrees of freedom
## Residual deviance:  5390.4  on 21610  degrees of freedom
## AIC: 5418.4
## 
## Number of Fisher Scoring iterations: 25
#CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue + BookingsCanceled + BookingsNoShowed + BookingsCheckedIn + PersonsNights + RoomNights + DistributionChannel + MarketSegment + SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
#SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + SRQuietRoom +      CheckedInCategory + CanceledCategory + NoShowCategory

# Let's do Model 6: the binary special-request variables plus MarketSegment, DistributionChannel and RoomNights.

# Logistic regression model 6: the thirteen special-request (SR*) columns plus
# MarketSegment, DistributionChannel and RoomNights.
log_model6 <- glm(
  formula = CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + MarketSegment + DistributionChannel + RoomNights,
  family = "binomial",
  data = TrainData
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(log_model6)
## 
## Call:
## glm(formula = CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom + 
##     SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed + 
##     SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + 
##     SRQuietRoom + MarketSegment + DistributionChannel + RoomNights, 
##     family = "binomial", data = TrainData)
## 
## Coefficients:
##                                             Estimate Std. Error z value
## (Intercept)                                  5.29705    0.64571   8.203
## SRHighFloor1                                 0.18061    0.14507   1.245
## SRLowFloor1                                  1.63057    1.34703   1.210
## SRAccessibleRoom1                            0.34341    3.92452   0.088
## SRMediumFloor1                               1.22961    0.98854   1.244
## SRBathtub1                                   0.68804    0.75734   0.908
## SRShower1                                    0.68650    0.98371   0.698
## SRCrib1                                      0.59693    0.25551   2.336
## SRKingSizeBed1                               0.22533    0.06724   3.351
## SRTwinBed1                                   0.35564    0.09058   3.926
## SRNearElevator1                              0.28049    1.99185   0.141
## SRAwayFromElevator1                         -0.21686    0.49113  -0.442
## SRNoAlcoholInMiniBar1                        9.76415  159.90601   0.061
## SRQuietRoom1                                 0.17840    0.11047   1.615
## MarketSegmentComplementary                  -3.34833    0.79636  -4.205
## MarketSegmentCorporate                      -2.94284    0.66251  -4.442
## MarketSegmentDirect                         -3.11538    0.77026  -4.045
## MarketSegmentGroups                         -3.12507    0.70644  -4.424
## MarketSegmentOther                          -3.04930    0.70851  -4.304
## MarketSegmentTravel Agent/Operator          -2.81317    0.71206  -3.951
## DistributionChannelDirect                    0.22925    0.42153   0.544
## DistributionChannelElectronic Distribution  -1.62069    0.50634  -3.201
## DistributionChannelTravel Agent/Operator     0.18137    0.29380   0.617
## RoomNights                                  -3.05732    0.04700 -65.052
##                                            Pr(>|z|)    
## (Intercept)                                2.34e-16 ***
## SRHighFloor1                               0.213139    
## SRLowFloor1                                0.226092    
## SRAccessibleRoom1                          0.930272    
## SRMediumFloor1                             0.213549    
## SRBathtub1                                 0.363617    
## SRShower1                                  0.485255    
## SRCrib1                                    0.019481 *  
## SRKingSizeBed1                             0.000805 ***
## SRTwinBed1                                 8.62e-05 ***
## SRNearElevator1                            0.888012    
## SRAwayFromElevator1                        0.658810    
## SRNoAlcoholInMiniBar1                      0.951310    
## SRQuietRoom1                               0.106332    
## MarketSegmentComplementary                 2.62e-05 ***
## MarketSegmentCorporate                     8.92e-06 ***
## MarketSegmentDirect                        5.24e-05 ***
## MarketSegmentGroups                        9.70e-06 ***
## MarketSegmentOther                         1.68e-05 ***
## MarketSegmentTravel Agent/Operator         7.79e-05 ***
## DistributionChannelDirect                  0.586546    
## DistributionChannelElectronic Distribution 0.001370 ** 
## DistributionChannelTravel Agent/Operator   0.537016    
## RoomNights                                  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29977.2  on 21623  degrees of freedom
## Residual deviance:  6047.5  on 21600  degrees of freedom
## AIC: 6095.5
## 
## Number of Fisher Scoring iterations: 11

# Let's do Model 7 with the binary variables, adding only RoomNights and PersonsNights.

# Logistic regression model 7: the thirteen special-request (SR*) columns plus
# PersonsNights and RoomNights.
# NOTE(review): the fit did not converge (see the recorded warnings below).
log_model7 <- glm(
  formula = CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + PersonsNights + RoomNights,
  family = "binomial",
  data = TrainData
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(log_model7)
## 
## Call:
## glm(formula = CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom + 
##     SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed + 
##     SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + 
##     SRQuietRoom + PersonsNights + RoomNights, family = "binomial", 
##     data = TrainData)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.574e+00  4.703e-02  54.721  < 2e-16 ***
## SRHighFloor1           3.177e-01  1.948e-01   1.631  0.10292    
## SRLowFloor1            5.735e+00  1.297e+01   0.442  0.65824    
## SRAccessibleRoom1      1.213e+00  1.398e+01   0.087  0.93083    
## SRMediumFloor1         3.737e+00  1.591e+00   2.349  0.01882 *  
## SRBathtub1             4.363e-01  9.293e-01   0.470  0.63868    
## SRShower1              1.246e+00  1.567e+00   0.795  0.42651    
## SRCrib1                1.239e+00  4.604e-01   2.691  0.00712 ** 
## SRKingSizeBed1         4.613e-01  7.414e-02   6.223 4.88e-10 ***
## SRTwinBed1             8.814e-01  1.286e-01   6.852 7.26e-12 ***
## SRNearElevator1        1.327e+00  2.533e+00   0.524  0.60029    
## SRAwayFromElevator1    4.184e-01  8.214e-01   0.509  0.61051    
## SRNoAlcoholInMiniBar1  2.280e+01  1.401e+05   0.000  0.99987    
## SRQuietRoom1           9.458e-02  1.244e-01   0.760  0.44705    
## PersonsNights         -2.156e+00  1.290e-01 -16.722  < 2e-16 ***
## RoomNights            -1.389e-01  1.397e-01  -0.994  0.32017    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29977.2  on 21623  degrees of freedom
## Residual deviance:  5532.1  on 21608  degrees of freedom
## AIC: 5564.1
## 
## Number of Fisher Scoring iterations: 25

Now that we have seven different logistic models, let's evaluate each of them on the test data.

#From chapter 9 Use this code example : confusionMatrix(deeper.ct.point.pred.train, factor(train.df$Personal.Loan))

library(pROC)

Model 1

# Predictions on test data 1.
# For a binomial glm fitted on a factor response, the first factor level is
# treated as "failure", so type = "response" is the probability of the SECOND
# level of CustomerOutcome (the recorded roc() message below shows the levels
# are 1 then 0, i.e. these are probabilities of class "0").
predictions1 <- predict(logistic_model1, newdata = TestData, type = "response")

# Evaluate model performance (ROC curve and AUC).
roc_curve1 <- roc(TestData$CustomerOutcome, predictions1)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score1 <- auc(roc_curve1)

# Confusion matrix at a 0.5 cut-off.
# Fix: the original labelled "probability > 0.5" as class 1 unconditionally,
# which inverts the predicted labels whenever the modelled (second) level is
# not "1". The labels are now derived from the level order the model was
# fitted with.
outcome_levels1 <- levels(
  as.factor(model.frame(logistic_model1)[["CustomerOutcome"]])
)
stopifnot(length(outcome_levels1) == 2L)
predicted_labels1 <- factor(
  ifelse(predictions1 > 0.5, outcome_levels1[2], outcome_levels1[1]),
  levels = outcome_levels1
)
conf_matrix1 <- table(
  Actual = factor(TestData$CustomerOutcome, levels = outcome_levels1),
  Predicted = predicted_labels1
)
print("Confusion Matrix Model 1:")
## [1] "Confusion Matrix Model 1:"
print(conf_matrix1)
# NOTE: the table below was recorded before the label fix. After re-knitting,
# the counts should stay the same but the Predicted column headers become
# 1 and 0 (in that order), putting the large counts on the diagonal.
##       Predicted
## Actual    0    1
##      1 7193   15
##      0   28 7180
auc_score1
## Area under the curve: 0.9995
# Model 1: caret confusion matrix at a 0.5 cut-off.
library(caret) # Load the caret package

# Level order used when the model was fitted. A binomial glm on a factor
# response models the probability of the SECOND level.
outcome_levels <- levels(
  as.factor(model.frame(logistic_model1)[["CustomerOutcome"]])
)
stopifnot(length(outcome_levels) == 2L)

# Fix: the original did `levels(x) <- c("0", "1")`, which RENAMES the existing
# levels instead of reordering them. That silently swapped the actual class
# labels, and "probability > 0.5" was hard-coded as class "1" even though the
# modelled level may be "0". Both factors are now built from the real labels.
predicted_classes <- factor(
  ifelse(predictions1 > 0.5, outcome_levels[2], outcome_levels[1]),
  levels = outcome_levels
)
actual_classes <- factor(TestData$CustomerOutcome, levels = outcome_levels)

# confusionMatrix() treats the first factor level as the positive class.
conf_matrix1A <- confusionMatrix(predicted_classes, actual_classes)
print("Confusion Matrix Model 1A:")
## [1] "Confusion Matrix Model 1A:"
print(conf_matrix1A)
# NOTE: the output recorded after this line predates the fix. Its counts and
# statistics should be unchanged on re-knit, but its class labels (0/1 and
# 'Positive' Class) are swapped relative to the real CustomerOutcome labels.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7193   28
##          1   15 7180
##                                          
##                Accuracy : 0.997          
##                  95% CI : (0.996, 0.9978)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.994          
##                                          
##  Mcnemar's Test P-Value : 0.06725        
##                                          
##             Sensitivity : 0.9979         
##             Specificity : 0.9961         
##          Pos Pred Value : 0.9961         
##          Neg Pred Value : 0.9979         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4990         
##    Detection Prevalence : 0.5009         
##       Balanced Accuracy : 0.9970         
##                                          
##        'Positive' Class : 0              
## 

#TrainData <- UnderSampled_HotelLisbon[trainIndex, ] #TestData <- UnderSampled_HotelLisbon[-trainIndex, ]

Model 2

# Model 2: predicted probabilities on the test set
predictions2 <- predict(log_model2, newdata = TestData, type = "response")

# ROC curve and AUC for Model 2
roc_curve2 <- roc(TestData$CustomerOutcome, predictions2)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score2 <- auc(roc_curve2)

# Confusion matrix at the 0.5 cut-off (rows = actual, columns = predicted).
# CustomerOutcome's levels are ordered c("1", "0") and pROC reports higher
# scores for class 0, so a score above 0.5 corresponds to the SECOND level.
# Labelling it 1 swapped the column headers; the predicted factor now reuses
# the outcome's own levels, so both margins share one order and every cell
# count keeps its position (diag() still holds the correct classifications).
conf_matrix2 <- table(
  Actual = TestData$CustomerOutcome,
  Predicted = factor(
    ifelse(predictions2 > 0.5,
           levels(TestData$CustomerOutcome)[2],
           levels(TestData$CustomerOutcome)[1]),
    levels = levels(TestData$CustomerOutcome)
  )
)
print("Confusion Matrix Model 2:")
## [1] "Confusion Matrix Model 2:"
print(conf_matrix2)
##       Predicted
## Actual    1    0
##      1 6869  339
##      0   51 7157
auc_score2
## Area under the curve: 0.9953
# Model 2: caret confusion matrix at the 0.5 cut-off

# Scores above 0.5 map to the SECOND outcome level: CustomerOutcome's levels
# are ordered c("1", "0") and pROC reports higher scores for class 0.
predicted_classes2 <- factor(
  ifelse(predictions2 > 0.5,
         levels(TestData$CustomerOutcome)[2],
         levels(TestData$CustomerOutcome)[1]),
  levels = levels(TestData$CustomerOutcome)
)

# Keep the observed labels untouched. `levels(x) <- c("0", "1")` renames
# levels by position, which on c("1", "0") silently swaps every label.
actual_classes2 <- as.factor(TestData$CustomerOutcome)

# Both factors share the same levels, as caret's confusionMatrix() requires
conf_matrix2A <- confusionMatrix(predicted_classes2, actual_classes2)
print("Confusion Matrix Model 2A:")
## [1] "Confusion Matrix Model 2A:"
print(conf_matrix2A)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1 6869   51
##          0  339 7157
##                                           
##                Accuracy : 0.9729          
##                  95% CI : (0.9702, 0.9755)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9459          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9530          
##             Specificity : 0.9929          
##          Pos Pred Value : 0.9926          
##          Neg Pred Value : 0.9548          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4765          
##    Detection Prevalence : 0.4800          
##       Balanced Accuracy : 0.9729          
##                                           
##        'Positive' Class : 1               
## 

Model 3

# Model 3: predicted probabilities on the test set
predictions3 <- predict(log_model3, newdata = TestData, type = "response")

# ROC curve and AUC for Model 3
roc_curve3 <- roc(TestData$CustomerOutcome, predictions3)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score3 <- auc(roc_curve3)

# Confusion matrix at the 0.5 cut-off (rows = actual, columns = predicted).
# CustomerOutcome's levels are ordered c("1", "0") and pROC reports higher
# scores for class 0, so a score above 0.5 corresponds to the SECOND level.
# Labelling it 1 swapped the column headers; the predicted factor now reuses
# the outcome's own levels, so both margins share one order and every cell
# count keeps its position (diag() still holds the correct classifications).
conf_matrix3 <- table(
  Actual = TestData$CustomerOutcome,
  Predicted = factor(
    ifelse(predictions3 > 0.5,
           levels(TestData$CustomerOutcome)[2],
           levels(TestData$CustomerOutcome)[1]),
    levels = levels(TestData$CustomerOutcome)
  )
)
print("Confusion Matrix Model 3:")
## [1] "Confusion Matrix Model 3:"
print(conf_matrix3)
##       Predicted
## Actual    1    0
##      1 6870  338
##      0   51 7157
auc_score3
## Area under the curve: 0.9956
# Model 3: caret confusion matrix at the 0.5 cut-off

# Scores above 0.5 map to the SECOND outcome level: CustomerOutcome's levels
# are ordered c("1", "0") and pROC reports higher scores for class 0.
predicted_classes3 <- factor(
  ifelse(predictions3 > 0.5,
         levels(TestData$CustomerOutcome)[2],
         levels(TestData$CustomerOutcome)[1]),
  levels = levels(TestData$CustomerOutcome)
)

# Keep the observed labels untouched. `levels(x) <- c("0", "1")` renames
# levels by position, which on c("1", "0") silently swaps every label.
actual_classes3 <- as.factor(TestData$CustomerOutcome)

# Both factors share the same levels, as caret's confusionMatrix() requires
conf_matrix3A <- confusionMatrix(predicted_classes3, actual_classes3)
print("Confusion Matrix Model 3A:")
## [1] "Confusion Matrix Model 3A:"
print(conf_matrix3A)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1 6870   51
##          0  338 7157
##                                           
##                Accuracy : 0.973           
##                  95% CI : (0.9702, 0.9756)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.946           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9531          
##             Specificity : 0.9929          
##          Pos Pred Value : 0.9926          
##          Neg Pred Value : 0.9549          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4766          
##    Detection Prevalence : 0.4801          
##       Balanced Accuracy : 0.9730          
##                                           
##        'Positive' Class : 1               
## 

Model 4

# Model 4: predicted probabilities on the test set
predictions4 <- predict(log_model4, newdata = TestData, type = "response")

# ROC curve and AUC for Model 4
roc_curve4 <- roc(TestData$CustomerOutcome, predictions4)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score4 <- auc(roc_curve4)

# Confusion matrix at the 0.5 cut-off (rows = actual, columns = predicted).
# CustomerOutcome's levels are ordered c("1", "0") and pROC reports higher
# scores for class 0, so a score above 0.5 corresponds to the SECOND level.
# Labelling it 1 swapped the column headers; the predicted factor now reuses
# the outcome's own levels, so both margins share one order and every cell
# count keeps its position (diag() still holds the correct classifications).
conf_matrix4 <- table(
  Actual = TestData$CustomerOutcome,
  Predicted = factor(
    ifelse(predictions4 > 0.5,
           levels(TestData$CustomerOutcome)[2],
           levels(TestData$CustomerOutcome)[1]),
    levels = levels(TestData$CustomerOutcome)
  )
)
print("Confusion Matrix Model 4:")
## [1] "Confusion Matrix Model 4:"
print(conf_matrix4)
##       Predicted
## Actual    1    0
##      1 3613 3595
##      0 3175 4033
auc_score4
## Area under the curve: 0.5407
# Model 4: caret confusion matrix at the 0.5 cut-off

# Scores above 0.5 map to the SECOND outcome level: CustomerOutcome's levels
# are ordered c("1", "0") and pROC reports higher scores for class 0.
predicted_classes4 <- factor(
  ifelse(predictions4 > 0.5,
         levels(TestData$CustomerOutcome)[2],
         levels(TestData$CustomerOutcome)[1]),
  levels = levels(TestData$CustomerOutcome)
)

# Keep the observed labels untouched. `levels(x) <- c("0", "1")` renames
# levels by position, which on c("1", "0") silently swaps every label.
actual_classes4 <- as.factor(TestData$CustomerOutcome)

# Both factors share the same levels, as caret's confusionMatrix() requires
conf_matrix4A <- confusionMatrix(predicted_classes4, actual_classes4)
print("Confusion Matrix Model 4A:")
## [1] "Confusion Matrix Model 4A:"
print(conf_matrix4A)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1 3613 3175
##          0 3595 4033
##                                           
##                Accuracy : 0.5304          
##                  95% CI : (0.5222, 0.5386)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 1.552e-13       
##                                           
##                   Kappa : 0.0608          
##                                           
##  Mcnemar's Test P-Value : 3.536e-07       
##                                           
##             Sensitivity : 0.5012          
##             Specificity : 0.5595          
##          Pos Pred Value : 0.5323          
##          Neg Pred Value : 0.5287          
##              Prevalence : 0.5000          
##          Detection Rate : 0.2506          
##    Detection Prevalence : 0.4709          
##       Balanced Accuracy : 0.5304          
##                                           
##        'Positive' Class : 1               
## 

Model 5

# Model 5: predicted probabilities on the test set
predictions5 <- predict(log_model5, newdata = TestData, type = "response")

# ROC curve and AUC for Model 5
roc_curve5 <- roc(TestData$CustomerOutcome, predictions5)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score5 <- auc(roc_curve5)

# Confusion matrix at the 0.5 cut-off (rows = actual, columns = predicted).
# CustomerOutcome's levels are ordered c("1", "0") and pROC reports higher
# scores for class 0, so a score above 0.5 corresponds to the SECOND level.
# Labelling it 1 swapped the column headers; the predicted factor now reuses
# the outcome's own levels, so both margins share one order and every cell
# count keeps its position (diag() still holds the correct classifications).
conf_matrix5 <- table(
  Actual = TestData$CustomerOutcome,
  Predicted = factor(
    ifelse(predictions5 > 0.5,
           levels(TestData$CustomerOutcome)[2],
           levels(TestData$CustomerOutcome)[1]),
    levels = levels(TestData$CustomerOutcome)
  )
)
print("Confusion Matrix Model 5:")
## [1] "Confusion Matrix Model 5:"
print(conf_matrix5)
##       Predicted
## Actual    1    0
##      1 6868  340
##      0   51 7157
auc_score5
## Area under the curve: 0.9956
# Model 5: caret confusion matrix at the 0.5 cut-off

# Scores above 0.5 map to the SECOND outcome level: CustomerOutcome's levels
# are ordered c("1", "0") and pROC reports higher scores for class 0.
predicted_classes5 <- factor(
  ifelse(predictions5 > 0.5,
         levels(TestData$CustomerOutcome)[2],
         levels(TestData$CustomerOutcome)[1]),
  levels = levels(TestData$CustomerOutcome)
)

# Keep the observed labels untouched. `levels(x) <- c("0", "1")` renames
# levels by position, which on c("1", "0") silently swaps every label.
actual_classes5 <- as.factor(TestData$CustomerOutcome)

# Both factors share the same levels, as caret's confusionMatrix() requires
conf_matrix5A <- confusionMatrix(predicted_classes5, actual_classes5)
print("Confusion Matrix Model 5A:")
## [1] "Confusion Matrix Model 5A:"
print(conf_matrix5A)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1 6868   51
##          0  340 7157
##                                           
##                Accuracy : 0.9729          
##                  95% CI : (0.9701, 0.9755)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9458          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9528          
##             Specificity : 0.9929          
##          Pos Pred Value : 0.9926          
##          Neg Pred Value : 0.9546          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4764          
##    Detection Prevalence : 0.4800          
##       Balanced Accuracy : 0.9729          
##                                           
##        'Positive' Class : 1               
## 

Model 6

# Model 6: predicted probabilities on the test set
predictions6 <- predict(log_model6, newdata = TestData, type = "response")

# ROC curve and AUC for Model 6
roc_curve6 <- roc(TestData$CustomerOutcome, predictions6)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score6 <- auc(roc_curve6)

# Confusion matrix at the 0.5 cut-off (rows = actual, columns = predicted).
# CustomerOutcome's levels are ordered c("1", "0") and pROC reports higher
# scores for class 0, so a score above 0.5 corresponds to the SECOND level.
# Labelling it 1 swapped the column headers; the predicted factor now reuses
# the outcome's own levels, so both margins share one order and every cell
# count keeps its position (diag() still holds the correct classifications).
conf_matrix6 <- table(
  Actual = TestData$CustomerOutcome,
  Predicted = factor(
    ifelse(predictions6 > 0.5,
           levels(TestData$CustomerOutcome)[2],
           levels(TestData$CustomerOutcome)[1]),
    levels = levels(TestData$CustomerOutcome)
  )
)
print("Confusion Matrix Model 6:")
## [1] "Confusion Matrix Model 6:"
print(conf_matrix6)
##       Predicted
## Actual    1    0
##      1 7194   14
##      0   54 7154
auc_score6
## Area under the curve: 0.9946
# Model 6: caret confusion matrix at the 0.5 cut-off

# Scores above 0.5 map to the SECOND outcome level: CustomerOutcome's levels
# are ordered c("1", "0") and pROC reports higher scores for class 0.
predicted_classes6 <- factor(
  ifelse(predictions6 > 0.5,
         levels(TestData$CustomerOutcome)[2],
         levels(TestData$CustomerOutcome)[1]),
  levels = levels(TestData$CustomerOutcome)
)

# Keep the observed labels untouched. `levels(x) <- c("0", "1")` renames
# levels by position, which on c("1", "0") silently swaps every label.
actual_classes6 <- as.factor(TestData$CustomerOutcome)

# Both factors share the same levels, as caret's confusionMatrix() requires
conf_matrix6A <- confusionMatrix(predicted_classes6, actual_classes6)
print("Confusion Matrix Model 6A:")
## [1] "Confusion Matrix Model 6A:"
print(conf_matrix6A)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1 7194   54
##          0   14 7154
##                                          
##                Accuracy : 0.9953         
##                  95% CI : (0.994, 0.9963)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9906         
##                                          
##  Mcnemar's Test P-Value : 2.251e-06      
##                                          
##             Sensitivity : 0.9981         
##             Specificity : 0.9925         
##          Pos Pred Value : 0.9925         
##          Neg Pred Value : 0.9980         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4990         
##    Detection Prevalence : 0.5028         
##       Balanced Accuracy : 0.9953         
##                                          
##        'Positive' Class : 1              
## 

Model 7

# Model 7: predicted probabilities on the test set
predictions7 <- predict(log_model7, newdata = TestData, type = "response")

# ROC curve and AUC for Model 7
roc_curve7 <- roc(TestData$CustomerOutcome, predictions7)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score7 <- auc(roc_curve7)

# Confusion matrix at the 0.5 cut-off (rows = actual, columns = predicted).
# CustomerOutcome's levels are ordered c("1", "0") and pROC reports higher
# scores for class 0, so a score above 0.5 corresponds to the SECOND level.
# Labelling it 1 swapped the column headers; the predicted factor now reuses
# the outcome's own levels, so both margins share one order and every cell
# count keeps its position (diag() still holds the correct classifications).
conf_matrix7 <- table(
  Actual = TestData$CustomerOutcome,
  Predicted = factor(
    ifelse(predictions7 > 0.5,
           levels(TestData$CustomerOutcome)[2],
           levels(TestData$CustomerOutcome)[1]),
    levels = levels(TestData$CustomerOutcome)
  )
)
print("Confusion Matrix Model 7:")
## [1] "Confusion Matrix Model 7:"
print(conf_matrix7)
##       Predicted
## Actual    1    0
##      1 6729  479
##      0   50 7158
auc_score7
## Area under the curve: 0.9953
# Model 7: caret confusion matrix at the 0.5 cut-off

# Scores above 0.5 map to the SECOND outcome level: CustomerOutcome's levels
# are ordered c("1", "0") and pROC reports higher scores for class 0.
predicted_classes7 <- factor(
  ifelse(predictions7 > 0.5,
         levels(TestData$CustomerOutcome)[2],
         levels(TestData$CustomerOutcome)[1]),
  levels = levels(TestData$CustomerOutcome)
)

# Keep the observed labels untouched. `levels(x) <- c("0", "1")` renames
# levels by position, which on c("1", "0") silently swaps every label.
actual_classes7 <- as.factor(TestData$CustomerOutcome)

# Both factors share the same levels, as caret's confusionMatrix() requires
conf_matrix7A <- confusionMatrix(predicted_classes7, actual_classes7)
print("Confusion Matrix Model 7A:")
## [1] "Confusion Matrix Model 7A:"
print(conf_matrix7A)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1 6729   50
##          0  479 7158
##                                           
##                Accuracy : 0.9633          
##                  95% CI : (0.9601, 0.9663)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9266          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9335          
##             Specificity : 0.9931          
##          Pos Pred Value : 0.9926          
##          Neg Pred Value : 0.9373          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4668          
##    Detection Prevalence : 0.4702          
##       Balanced Accuracy : 0.9633          
##                                           
##        'Positive' Class : 1               
## 

#Performance metrics (accuracy, precision, recall, F1-score, and AUC) for each model. The code for the specified models follows:

# Model 1 metrics from conf_matrix1 (rows = actual, columns = predicted).
# The second row/column is treated as the positive class.
accuracy_model1 <- sum(diag(conf_matrix1)) / sum(conf_matrix1)
# Precision = true positives / everything PREDICTED positive (column total).
# The row total was used here before, which is recall.
precision_model1 <- conf_matrix1[2, 2] / sum(conf_matrix1[, 2])
# Recall = true positives / everything ACTUALLY positive (row total)
recall_model1 <- conf_matrix1[2, 2] / sum(conf_matrix1[2, ])
# F1 is symmetric in precision and recall, so its value is unaffected
f1_score_model1 <- 2 * precision_model1 * recall_model1 / (precision_model1 + recall_model1)

roc_curve_model1 <- roc(TestData$CustomerOutcome, predictions1)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model1 <- auc(roc_curve_model1)

# Display metrics
cat("Model 1 Metrics:\n")
## Model 1 Metrics:
cat("Accuracy:", accuracy_model1, "\n")
## Accuracy: 0.9970172
cat("Precision:", precision_model1, "\n")
## Precision: 0.9979152
cat("Recall:", recall_model1, "\n")
## Recall: 0.9961154
cat("F1-Score:", f1_score_model1, "\n")
## F1-Score: 0.9970145
cat("AUC:", auc_score_model1, "\n\n")
## AUC: 0.9994538
# Model 2 metrics from conf_matrix2 (rows = actual, columns = predicted).
# The second row/column is treated as the positive class.
accuracy_model2 <- sum(diag(conf_matrix2)) / sum(conf_matrix2)
# Precision = true positives / everything PREDICTED positive (column total).
# The row total was used here before, which is recall.
precision_model2 <- conf_matrix2[2, 2] / sum(conf_matrix2[, 2])
# Recall = true positives / everything ACTUALLY positive (row total)
recall_model2 <- conf_matrix2[2, 2] / sum(conf_matrix2[2, ])
# F1 is symmetric in precision and recall, so its value is unaffected
f1_score_model2 <- 2 * precision_model2 * recall_model2 / (precision_model2 + recall_model2)

roc_curve_model2 <- roc(TestData$CustomerOutcome, predictions2)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model2 <- auc(roc_curve_model2)

# Display metrics
cat("Model 2 Metrics:\n")
## Model 2 Metrics:
cat("Accuracy:", accuracy_model2, "\n")
## Accuracy: 0.9729467
cat("Precision:", precision_model2, "\n")
## Precision: 0.9547759
cat("Recall:", recall_model2, "\n")
## Recall: 0.9929245
cat("F1-Score:", f1_score_model2, "\n")
## F1-Score: 0.9734766
cat("AUC:", auc_score_model2, "\n\n")
## AUC: 0.9953376
# Model 4 metrics from conf_matrix4 (rows = actual, columns = predicted).
# The second row/column is treated as the positive class.
accuracy_model4 <- sum(diag(conf_matrix4)) / sum(conf_matrix4)
# Precision = true positives / everything PREDICTED positive (column total).
# The row total was used here before, which is recall.
precision_model4 <- conf_matrix4[2, 2] / sum(conf_matrix4[, 2])
# Recall = true positives / everything ACTUALLY positive (row total)
recall_model4 <- conf_matrix4[2, 2] / sum(conf_matrix4[2, ])
# F1 is symmetric in precision and recall, so its value is unaffected
f1_score_model4 <- 2 * precision_model4 * recall_model4 / (precision_model4 + recall_model4)

roc_curve_model4 <- roc(TestData$CustomerOutcome, predictions4)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model4 <- auc(roc_curve_model4)

# Display metrics
cat("Model 4 Metrics:\n")
## Model 4 Metrics:
cat("Accuracy:", accuracy_model4, "\n")
## Accuracy: 0.5303829
cat("Precision:", precision_model4, "\n")
## Precision: 0.52871
cat("Recall:", recall_model4, "\n")
## Recall: 0.5595172
cat("F1-Score:", f1_score_model4, "\n")
## F1-Score: 0.5436775
cat("AUC:", auc_score_model4, "\n\n")
## AUC: 0.5407153
# Model 5 metrics from conf_matrix5 (rows = actual, columns = predicted).
# The second row/column is treated as the positive class.
accuracy_model5 <- sum(diag(conf_matrix5)) / sum(conf_matrix5)
# Precision = true positives / everything PREDICTED positive (column total).
# The row total was used here before, which is recall.
precision_model5 <- conf_matrix5[2, 2] / sum(conf_matrix5[, 2])
# Recall = true positives / everything ACTUALLY positive (row total)
recall_model5 <- conf_matrix5[2, 2] / sum(conf_matrix5[2, ])
# F1 is symmetric in precision and recall, so its value is unaffected
f1_score_model5 <- 2 * precision_model5 * recall_model5 / (precision_model5 + recall_model5)

roc_curve_model5 <- roc(TestData$CustomerOutcome, predictions5)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model5 <- auc(roc_curve_model5)

# Display metrics
cat("Model 5 Metrics:\n")
## Model 5 Metrics:
cat("Accuracy:", accuracy_model5, "\n")
## Accuracy: 0.9728774
cat("Precision:", precision_model5, "\n")
## Precision: 0.9546485
cat("Recall:", recall_model5, "\n")
## Recall: 0.9929245
cat("F1-Score:", f1_score_model5, "\n")
## F1-Score: 0.9734104
cat("AUC:", auc_score_model5, "\n\n")
## AUC: 0.9956395
# Model 6 metrics from conf_matrix6 (rows = actual, columns = predicted).
# The second row/column is treated as the positive class.
accuracy_model6 <- sum(diag(conf_matrix6)) / sum(conf_matrix6)
# Precision = true positives / everything PREDICTED positive (column total).
# The row total was used here before, which is recall.
precision_model6 <- conf_matrix6[2, 2] / sum(conf_matrix6[, 2])
# Recall = true positives / everything ACTUALLY positive (row total)
recall_model6 <- conf_matrix6[2, 2] / sum(conf_matrix6[2, ])
# F1 is symmetric in precision and recall, so its value is unaffected
f1_score_model6 <- 2 * precision_model6 * recall_model6 / (precision_model6 + recall_model6)

roc_curve_model6 <- roc(TestData$CustomerOutcome, predictions6)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model6 <- auc(roc_curve_model6)

# Display metrics
cat("Model 6 Metrics:\n")
## Model 6 Metrics:
cat("Accuracy:", accuracy_model6, "\n")
## Accuracy: 0.995283
cat("Precision:", precision_model6, "\n")
## Precision: 0.9980469
cat("Recall:", recall_model6, "\n")
## Recall: 0.9925083
cat("F1-Score:", f1_score_model6, "\n")
## F1-Score: 0.9952699
cat("AUC:", auc_score_model6, "\n\n")
## AUC: 0.9945955
# Model 7 metrics from conf_matrix7 (rows = actual, columns = predicted).
# The second row/column is treated as the positive class.
accuracy_model7 <- sum(diag(conf_matrix7)) / sum(conf_matrix7)
# Precision = true positives / everything PREDICTED positive (column total).
# The row total was used here before, which is recall.
precision_model7 <- conf_matrix7[2, 2] / sum(conf_matrix7[, 2])
# Recall = true positives / everything ACTUALLY positive (row total)
recall_model7 <- conf_matrix7[2, 2] / sum(conf_matrix7[2, ])
# F1 is symmetric in precision and recall, so its value is unaffected
f1_score_model7 <- 2 * precision_model7 * recall_model7 / (precision_model7 + recall_model7)

roc_curve_model7 <- roc(TestData$CustomerOutcome, predictions7)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model7 <- auc(roc_curve_model7)

# Display metrics
cat("Model 7 Metrics:\n")
## Model 7 Metrics:
cat("Accuracy:", accuracy_model7, "\n")
## Accuracy: 0.9633047
cat("Precision:", precision_model7, "\n")
## Precision: 0.937279
cat("Recall:", recall_model7, "\n")
## Recall: 0.9930633
cat("F1-Score:", f1_score_model7, "\n")
## F1-Score: 0.9643651
cat("AUC:", auc_score_model7, "\n\n")
## AUC: 0.9952783

#generating the decile lift curve, bar chart, and goodness of fit statistics for each model:

library(caret)
library(pROC)
library(gains)
## Warning: package 'gains' was built under R version 4.3.1

#CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue + PersonsNights + RoomNights + DistributionChannel + MarketSegment + SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + SRQuietRoom + CheckedInCategory

# Model 1: outcome-0 counts per predicted-probability bin

# Logical indicator: TRUE where the observed CustomerOutcome label equals 0
TestData[["CustomerOutcomeNumeric"]] <-
  as.numeric(levels(TestData[["CustomerOutcome"]]))[TestData[["CustomerOutcome"]]] == 0

# Attach the Model 1 probabilities and bin them into ten 0.1-wide intervals
TestData[["predicted_prob"]] <- predictions1
TestData[["decile"]] <- cut(
  TestData[["predicted_prob"]],
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)

# Sum the indicator within each bin
decile_analysis1 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)

# Bar chart of the per-bin counts (decile is already a factor produced by cut)
ggplot(decile_analysis1, aes(x = decile, y = CustomerOutcomeNumeric)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  labs(
    title = "Decile Analysis for Model 1",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )

# Model 2: outcome-0 counts per predicted-probability bin

# Logical indicator: TRUE where the observed CustomerOutcome label equals 0
TestData[["CustomerOutcomeNumeric"]] <-
  as.numeric(levels(TestData[["CustomerOutcome"]]))[TestData[["CustomerOutcome"]]] == 0

# Attach the Model 2 probabilities and bin them into ten 0.1-wide intervals
TestData[["predicted_prob"]] <- predictions2
TestData[["decile"]] <- cut(
  TestData[["predicted_prob"]],
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)

# Sum the indicator within each bin
decile_analysis2 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)

# Bar chart of the per-bin counts (decile is already a factor produced by cut)
ggplot(decile_analysis2, aes(x = decile, y = CustomerOutcomeNumeric)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  labs(
    title = "Decile Analysis for Model 2",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )

# Model 3: outcome-0 counts per predicted-probability bin

# Logical indicator: TRUE where the observed CustomerOutcome label equals 0
TestData[["CustomerOutcomeNumeric"]] <-
  as.numeric(levels(TestData[["CustomerOutcome"]]))[TestData[["CustomerOutcome"]]] == 0

# Attach the Model 3 probabilities and bin them into ten 0.1-wide intervals
TestData[["predicted_prob"]] <- predictions3
TestData[["decile"]] <- cut(
  TestData[["predicted_prob"]],
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)

# Sum the indicator within each bin
decile_analysis3 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)

# Bar chart of the per-bin counts (decile is already a factor produced by cut)
ggplot(decile_analysis3, aes(x = decile, y = CustomerOutcomeNumeric)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  labs(
    title = "Decile Analysis for Model 3",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )

# Model 4: outcome-0 counts per predicted-probability bin

# Logical indicator: TRUE where the observed CustomerOutcome label equals 0
TestData[["CustomerOutcomeNumeric"]] <-
  as.numeric(levels(TestData[["CustomerOutcome"]]))[TestData[["CustomerOutcome"]]] == 0

# Attach the Model 4 probabilities and bin them into ten 0.1-wide intervals
TestData[["predicted_prob"]] <- predictions4
TestData[["decile"]] <- cut(
  TestData[["predicted_prob"]],
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)

# Sum the indicator within each bin
decile_analysis4 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)

# Bar chart of the per-bin counts (decile is already a factor produced by cut)
ggplot(decile_analysis4, aes(x = decile, y = CustomerOutcomeNumeric)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  labs(
    title = "Decile Analysis for Model 4",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )

# Model 5: outcome-0 counts per predicted-probability bin

# Logical indicator: TRUE where the observed CustomerOutcome label equals 0
TestData[["CustomerOutcomeNumeric"]] <-
  as.numeric(levels(TestData[["CustomerOutcome"]]))[TestData[["CustomerOutcome"]]] == 0

# Attach the Model 5 probabilities and bin them into ten 0.1-wide intervals
TestData[["predicted_prob"]] <- predictions5
TestData[["decile"]] <- cut(
  TestData[["predicted_prob"]],
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)

# Sum the indicator within each bin
decile_analysis5 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)

# Bar chart of the per-bin counts (decile is already a factor produced by cut)
ggplot(decile_analysis5, aes(x = decile, y = CustomerOutcomeNumeric)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  labs(
    title = "Decile Analysis for Model 5",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )

# Model 6: outcome-0 counts per predicted-probability bin

# Logical indicator: TRUE where the observed CustomerOutcome label equals 0
TestData[["CustomerOutcomeNumeric"]] <-
  as.numeric(levels(TestData[["CustomerOutcome"]]))[TestData[["CustomerOutcome"]]] == 0

# Attach the Model 6 probabilities and bin them into ten 0.1-wide intervals
TestData[["predicted_prob"]] <- predictions6
TestData[["decile"]] <- cut(
  TestData[["predicted_prob"]],
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)

# Sum the indicator within each bin
decile_analysis6 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)

# Bar chart of the per-bin counts (decile is already a factor produced by cut)
ggplot(decile_analysis6, aes(x = decile, y = CustomerOutcomeNumeric)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  labs(
    title = "Decile Analysis for Model 6",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )

# Model 7: outcome-0 counts per predicted-probability bin

# Logical indicator: TRUE where the observed CustomerOutcome label equals 0
TestData[["CustomerOutcomeNumeric"]] <-
  as.numeric(levels(TestData[["CustomerOutcome"]]))[TestData[["CustomerOutcome"]]] == 0

# Attach the Model 7 probabilities and bin them into ten 0.1-wide intervals
TestData[["predicted_prob"]] <- predictions7
TestData[["decile"]] <- cut(
  TestData[["predicted_prob"]],
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)

# Sum the indicator within each bin
decile_analysis7 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)

# Bar chart of the per-bin counts (decile is already a factor produced by cut)
ggplot(decile_analysis7, aes(x = decile, y = CustomerOutcomeNumeric)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  labs(
    title = "Decile Analysis for Model 7",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )

Non-Parametric Analysis

# Random forest on the predictor set the original note attributes to log_model5
# (special-request flags, market segment and distribution channel)

# Load the necessary library
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Fit a 500-tree forest on the training data
rf_model5 <- randomForest(
  CustomerOutcome ~
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + MarketSegment + DistributionChannel,
  data = TrainData,
  ntree = 500
)

# Predict a class for every test row
rf_predictions5 <- predict(rf_model5, newdata = TestData)

# Cross-tabulate actual (rows) against predicted (columns); accuracy is the
# diagonal share of the table
rf_conf_matrix5 <- table(TestData[["CustomerOutcome"]], rf_predictions5)
rf_accuracy5 <- sum(diag(rf_conf_matrix5)) / sum(rf_conf_matrix5)

# Report the confusion matrix and accuracy
print(rf_conf_matrix5)
##    rf_predictions5
##        1    0
##   1 2115 5093
##   0 1268 5940
print(paste("Accuracy:", rf_accuracy5))
## [1] "Accuracy: 0.558754162042175"

# Random forest on the predictor set the original note attributes to log_model6
# (the special-request/segment/channel terms plus RoomNights)

# Load the necessary library
library(randomForest)

# Fit a 500-tree forest on the training data
rf_model6 <- randomForest(
  CustomerOutcome ~
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + MarketSegment + DistributionChannel + RoomNights,
  data = TrainData,
  ntree = 500
)

# Predict a class for every test row
rf_predictions6 <- predict(rf_model6, newdata = TestData)

# Cross-tabulate actual (rows) against predicted (columns); accuracy is the
# diagonal share of the table
rf_conf_matrix6 <- table(TestData[["CustomerOutcome"]], rf_predictions6)
rf_accuracy6 <- sum(diag(rf_conf_matrix6)) / sum(rf_conf_matrix6)

# Report the confusion matrix and accuracy
print(rf_conf_matrix6)
##    rf_predictions6
##        1    0
##   1 7204    4
##   0   44 7164
print(paste("Accuracy:", rf_accuracy6))
## [1] "Accuracy: 0.996670366259711"

# Random forest on the predictor set the original note attributes to the
# first logistic model

# Load the necessary library
library(randomForest)

# Fit a 500-tree forest on the training data
rf_model1 <- randomForest(
  CustomerOutcome ~
    Continent + Age + LodgingRevenue + OtherRevenue + PersonsNights +
    RoomNights + DistributionChannel + MarketSegment +
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + CheckedInCategory,
  data = TrainData,
  ntree = 500
)

# Predict a class for every test row
rf_predictions1 <- predict(rf_model1, newdata = TestData)

# Cross-tabulate actual (rows) against predicted (columns); accuracy is the
# diagonal share of the table
rf_conf_matrix1 <- table(TestData[["CustomerOutcome"]], rf_predictions1)
rf_accuracy1 <- sum(diag(rf_conf_matrix1)) / sum(rf_conf_matrix1)

# Report the confusion matrix and accuracy
print(rf_conf_matrix1)
##    rf_predictions1
##        1    0
##   1 7198   10
##   0   33 7175
print(paste("Accuracy:", rf_accuracy1))
## [1] "Accuracy: 0.997017203107658"

# Random forest on the predictor set the original note attributes to Model 4
# (special-request flags only)

# Load the necessary library
library(randomForest)

# Fit a 500-tree forest on the training data
rf_model4 <- randomForest(
  CustomerOutcome ~
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom,
  data = TrainData,
  ntree = 500
)

# Predict a class for every test row
rf_predictions4 <- predict(rf_model4, newdata = TestData)

# Cross-tabulate actual (rows) against predicted (columns); accuracy is the
# diagonal share of the table
rf_conf_matrix4 <- table(TestData[["CustomerOutcome"]], rf_predictions4)
rf_accuracy4 <- sum(diag(rf_conf_matrix4)) / sum(rf_conf_matrix4)

# Report the confusion matrix and accuracy
print(rf_conf_matrix4)
##    rf_predictions4
##        1    0
##   1 3440 3768
##   0 2973 4235
print(paste("Accuracy:", rf_accuracy4))
## [1] "Accuracy: 0.532394561598224"

Let’s use Naive Bayes

# Naive Bayes, Model 5 predictor set
# (special-request flags, market segment and distribution channel)

library(e1071)
## Warning: package 'e1071' was built under R version 4.3.1
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:PerformanceAnalytics':
## 
##     kurtosis, skewness
# Fit the classifier on the training data
nb_model5 <- naiveBayes(
  CustomerOutcome ~
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + MarketSegment + DistributionChannel,
  data = TrainData
)

# Predict a class for every test row
nb_predictions5 <- predict(nb_model5, newdata = TestData)

# Actual (rows) against predicted (columns); accuracy is the diagonal share
nb_conf_matrix5 <- table(TestData[["CustomerOutcome"]], nb_predictions5)
nb_accuracy5 <- sum(diag(nb_conf_matrix5)) / sum(nb_conf_matrix5)

print(nb_conf_matrix5)
##    nb_predictions5
##        1    0
##   1 3396 3812
##   0 2869 4339
print(paste("Naive Bayes Model 5 Accuracy:", nb_accuracy5))
## [1] "Naive Bayes Model 5 Accuracy: 0.536556603773585"

# Naive Bayes, Model 6 predictor set
# (the special-request/segment/channel terms plus RoomNights)

# Fit the classifier on the training data
nb_model6 <- naiveBayes(
  CustomerOutcome ~
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + MarketSegment + DistributionChannel + RoomNights,
  data = TrainData
)

# Predict a class for every test row
nb_predictions6 <- predict(nb_model6, newdata = TestData)

# Actual (rows) against predicted (columns); accuracy is the diagonal share
nb_conf_matrix6 <- table(TestData[["CustomerOutcome"]], nb_predictions6)
nb_accuracy6 <- sum(diag(nb_conf_matrix6)) / sum(nb_conf_matrix6)

print(nb_conf_matrix6)
##    nb_predictions6
##        1    0
##   1 6020 1188
##   0  121 7087
print(paste("Naive Bayes Model 6 Accuracy:", nb_accuracy6))
## [1] "Naive Bayes Model 6 Accuracy: 0.909198113207547"

# Naive Bayes, Model 1 predictor set (all listed predictors)

# Fit the classifier on the training data
nb_model1 <- naiveBayes(
  CustomerOutcome ~
    Continent + Age + LodgingRevenue + OtherRevenue + PersonsNights +
    RoomNights + DistributionChannel + MarketSegment +
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + CheckedInCategory,
  data = TrainData
)

# Predict a class for every test row
nb_predictions1 <- predict(nb_model1, newdata = TestData)

# Actual (rows) against predicted (columns); accuracy is the diagonal share
nb_conf_matrix1 <- table(TestData[["CustomerOutcome"]], nb_predictions1)
nb_accuracy1 <- sum(diag(nb_conf_matrix1)) / sum(nb_conf_matrix1)

print(nb_conf_matrix1)
##    nb_predictions1
##        1    0
##   1 7193   15
##   0   51 7157
print(paste("Naive Bayes Model 1 Accuracy:", nb_accuracy1))
## [1] "Naive Bayes Model 1 Accuracy: 0.995421753607103"

# Naive Bayes, Model 4 predictor set (special-request flags only)

# Fit the classifier on the training data
nb_model4 <- naiveBayes(
  CustomerOutcome ~
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom,
  data = TrainData
)

# Predict a class for every test row
nb_predictions4 <- predict(nb_model4, newdata = TestData)

# Actual (rows) against predicted (columns); accuracy is the diagonal share
nb_conf_matrix4 <- table(TestData[["CustomerOutcome"]], nb_predictions4)
nb_accuracy4 <- sum(diag(nb_conf_matrix4)) / sum(nb_conf_matrix4)

print(nb_conf_matrix4)
##    nb_predictions4
##        1    0
##   1 5557 1651
##   0 5187 2021
print(paste("Naive Bayes Model 4 Accuracy:", nb_accuracy4))
## [1] "Naive Bayes Model 4 Accuracy: 0.525665926748058"

#Let’s plot the ROC curve and compute the AUC for each model

# Model 1: ROC curve and AUC

# Load the pROC library
library(pROC)

# Build the ROC object from the observed outcome and the Model 1 scores
roc_curve_model1 <- roc(
  TestData[["CustomerOutcome"]],
  predictions1
)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_model1 <- auc(roc_curve_model1)
plot(roc_curve_model1, main = "ROC Curve for Model 1")

print(paste("AUC for Model 1:", auc_model1))
## [1] "AUC for Model 1: 0.999453780082804"
# The other models repeat these steps with their own prediction vectors
# in place of predictions1.

# Model 2: ROC curve and AUC

# Build the ROC object from the observed outcome and the Model 2 scores
roc_curve_model2 <- roc(
  TestData[["CustomerOutcome"]],
  predictions2
)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_model2 <- auc(roc_curve_model2)
plot(roc_curve_model2, main = "ROC Curve for Model 2")

print(paste("AUC for Model 2:", auc_model2))
## [1] "AUC for Model 2: 0.995337604289721"

# Model 4: ROC curve and AUC

# Build the ROC object from the observed outcome and the Model 4 scores
roc_curve_model4 <- roc(
  TestData[["CustomerOutcome"]],
  predictions4
)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_model4 <- auc(roc_curve_model4)
plot(roc_curve_model4, main = "ROC Curve for Model 4")

print(paste("AUC for Model 4:", auc_model4))
## [1] "AUC for Model 4: 0.540715277666571"

# Model 5: ROC curve and AUC

# Build the ROC object from the observed outcome and the Model 5 scores
roc_curve_model5 <- roc(
  TestData[["CustomerOutcome"]],
  predictions5
)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_model5 <- auc(roc_curve_model5)
plot(roc_curve_model5, main = "ROC Curve for Model 5")

print(paste("AUC for Model 5:", auc_model5))
## [1] "AUC for Model 5: 0.995639469756135"

# Model 6: ROC curve and AUC

# Build the ROC object from the observed outcome and the Model 6 scores
roc_curve_model6 <- roc(
  TestData[["CustomerOutcome"]],
  predictions6
)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_model6 <- auc(roc_curve_model6)
plot(roc_curve_model6, main = "ROC Curve for Model 6")

print(paste("AUC for Model 6:", auc_model6))
## [1] "AUC for Model 6: 0.994595456198625"

# Model 7: ROC curve and AUC

# Build the ROC object from the observed outcome and the Model 7 scores
roc_curve_model7 <- roc(
  TestData[["CustomerOutcome"]],
  predictions7
)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_model7 <- auc(roc_curve_model7)
plot(roc_curve_model7, main = "ROC Curve for Model 7")

print(paste("AUC for Model 7:", auc_model7))
## [1] "AUC for Model 7: 0.995278284025272"